From 210b49cac9b08b11981444141c3d12318231578c Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Tue, 30 Apr 2019 08:09:13 -0700
Subject: [PATCH 001/572] Disable pipelined write in atomic flush stress test (#5266)

Summary:
Pipelined write currently allows one thread to perform memtable writes while another thread is traversing the `flush_scheduler_`, which can cause an assertion failure in `FlushScheduler::Clear`. To unblock crash recovery tests, we temporarily disable pipelined write when atomic flush is enabled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5266
Differential Revision: D15142285
Pulled By: riversand963
fbshipit-source-id: a0c20fe4ac543e08feaed602414f982054df7831
---
 tools/db_crashtest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index a1a9ecb66ea..a27abe8cf50 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -136,6 +136,8 @@ def is_direct_io_supported(dbname):
     # use small value for write_buffer_size so that RocksDB triggers flush
     # more frequently
     "write_buffer_size": 1024 * 1024,
+    # disable pipelined write when test_atomic_flush is true
+    "enable_pipelined_write": 0,
 }

From 25810ca9c7158ec71ec27f8dd98b4d61ff88fc66 Mon Sep 17 00:00:00 2001
From: bxq2011hust
Date: Tue, 30 Apr 2019 09:30:46 -0700
Subject: [PATCH 002/572] compile gtest only when tests are enabled

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5248
Differential Revision: D15149190
Pulled By: maysamyabandeh
fbshipit-source-id: fd6d799e80bb502a7ddbc07032ea87e2e3f1e24f
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb8067d2245..f4feee986c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -461,8 +461,6 @@ include_directories(${PROJECT_SOURCE_DIR}/include)
 include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src)
 find_package(Threads REQUIRED)
 
-add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
-
 # Main library source code
 
 set(SOURCES
@@ -843,6 +841,7 @@ endif()
 
 option(WITH_TESTS "build with tests" ON)
 if(WITH_TESTS)
+  add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
   set(TESTS
     cache/cache_test.cc
     cache/lru_cache_test.cc

From b02d0c238db9278cd45375cb10e32161244fd3c9 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Tue, 30 Apr 2019 09:46:40 -0700
Subject: [PATCH 003/572] Init compression dict handle before reading meta-blocks (#5267)

Summary:
At least one of the meta-block loading functions (`ReadRangeDelBlock`) uses the same block reading function (`NewDataBlockIterator`) as data block reads, which means it uses the dictionary handle. However, the dictionary handle was uninitialized while reading meta-blocks, causing readers to receive an error. This situation was only noticed when `cache_index_and_filter_blocks=true`.

This PR initializes the handle to null while reading meta-blocks to prevent the error. It also adds support to `db_stress` / `db_crashtest.py` for `cache_index_and_filter_blocks`.

Fixes #5263.
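For readers who want to reproduce the original failure, here is a minimal configuration sketch that exercises the affected path. It is hypothetical (the path, cache size, and keys are ours); the essential ingredient, per the report, is `cache_index_and_filter_blocks=true` together with a block cache.

```cpp
#include <cassert>
#include <string>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = rocksdb::NewLRUCache(1 << 20);
  // The configuration under which the uninitialized dictionary handle
  // was observed while loading meta-blocks.
  table_options.cache_index_and_filter_blocks = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/repro_5263", &db);
  assert(s.ok());

  // Flush creates an SST file; the subsequent read forces its meta-blocks
  // to be loaded, which is where the error surfaced before this fix.
  db->Put(rocksdb::WriteOptions(), "key", "value");
  db->Flush(rocksdb::FlushOptions());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);

  delete db;
  return s.ok() ? 0 : 1;
}
```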
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5267
Differential Revision: D15149264
Pulled By: maysamyabandeh
fbshipit-source-id: 991d38a306c62db5976778bfb050fa3cd4a0671b
---
 table/block_based_table_reader.cc | 5 +++++
 tools/db_crashtest.py             | 1 +
 tools/db_stress.cc                | 5 +++++
 3 files changed, 11 insertions(+)

diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index ad088337a19..d6c9ab88796 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -838,6 +838,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                         rep->persistent_cache_key_prefix_size),
                     rep->ioptions.statistics);
 
+  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
+  // handle to null, otherwise it may be seen as uninitialized during the below
+  // meta-block reads.
+  rep->compression_dict_handle = BlockHandle::NullBlockHandle();
+
   // Read metaindex
   std::unique_ptr<Block> meta;
   std::unique_ptr<InternalIterator> meta_iter;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index a27abe8cf50..6c7fbabbf11 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -25,6 +25,7 @@
 default_params = {
     "acquire_snapshot_one_in": 10000,
     "block_size": 16384,
+    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
     "cache_size": 1048576,
     "checkpoint_one_in": 1000000,
     "compression_type": "snappy",
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 7f8c4b53f7b..2ecd2aa6d13 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -297,6 +297,9 @@ DEFINE_int32(set_in_place_one_in, 0,
 DEFINE_int64(cache_size, 2LL * KB * KB * KB,
              "Number of bytes to use as a cache of uncompressed data.");
 
+DEFINE_bool(cache_index_and_filter_blocks, false,
+            "True if indexes/filters should be cached in block cache.");
+
 DEFINE_bool(use_clock_cache, false,
             "Replace default LRU block cache with clock cache.");
 
@@ -2578,6 +2581,8 @@ class StressTest {
     if (FLAGS_options_file.empty()) {
       BlockBasedTableOptions block_based_options;
       block_based_options.block_cache = cache_;
+      block_based_options.cache_index_and_filter_blocks =
+          FLAGS_cache_index_and_filter_blocks;
       block_based_options.block_cache_compressed = compressed_cache_;
       block_based_options.checksum = FLAGS_checksum_type_e;
       block_based_options.block_size = FLAGS_block_size;

From a5debd7ed821489c5f9e87c805fdd5bc30a85388 Mon Sep 17 00:00:00 2001
From: David Palm
Date: Tue, 30 Apr 2019 10:08:13 -0700
Subject: [PATCH 004/572] Add rocksdb_property_int_cf (#5268)

Summary:
Adds the missing `rocksdb_property_int_cf` function to the C API to let consuming libraries avoid parsing strings.
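A hedged usage sketch of the new function follows; the property name and surrounding setup are illustrative assumptions, while the function itself is the one added in the diff below.

```cpp
#include <stdint.h>
#include <stdio.h>

#include "rocksdb/c.h"

// Assumes `db` and `cf` were obtained earlier, e.g. via
// rocksdb_open_column_families(). "rocksdb.estimate-num-keys" is one of
// the standard integer-valued properties.
void print_estimated_num_keys(rocksdb_t* db,
                              rocksdb_column_family_handle_t* cf) {
  uint64_t num_keys = 0;
  if (rocksdb_property_int_cf(db, cf, "rocksdb.estimate-num-keys",
                              &num_keys) == 0) {
    printf("estimated keys: %llu\n", (unsigned long long)num_keys);
  } else {
    printf("not an integer property, or unknown property name\n");
  }
}
```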
Fixes https://github.com/facebook/rocksdb/issues/5249

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5268
Differential Revision: D15149461
Pulled By: maysamyabandeh
fbshipit-source-id: e9fe5f1ad7c64066d921dba8473507269b51d331
---
 db/c.cc             | 12 ++++++++++++
 include/rocksdb/c.h |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 743a88d838e..aac1cf4087c 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -1070,6 +1070,18 @@ int rocksdb_property_int(
   }
 }
 
+int rocksdb_property_int_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname,
+    uint64_t *out_val) {
+  if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
 char* rocksdb_property_value_cf(
     rocksdb_t* db,
     rocksdb_column_family_handle_t* column_family,
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ceb99ebf945..a0ae7ca7785 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -336,6 +336,11 @@ int rocksdb_property_int(
     rocksdb_t* db, const char* propname,
     uint64_t *out_val);
 
+/* returns 0 on success, -1 otherwise */
+int rocksdb_property_int_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* propname, uint64_t *out_val);
+
 extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
     rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
     const char* propname);

From 03c7ae24c20d0123ef3e45077fd683946ff3384d Mon Sep 17 00:00:00 2001
From: Yuqi Gu
Date: Tue, 30 Apr 2019 10:56:06 -0700
Subject: [PATCH 005/572] RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221)

Summary:
1. Add an Arm linear crc32c implementation for RocksDB.
2. Add an Arm runtime check for crc32 support.
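Before the diff, here is a standalone sketch of the runtime-detection idiom the patch relies on. The function name is ours; the patch wires the same check into `Choose_Extend` via `crc32c_runtime_check`.

```cpp
#include <cstdio>

#if defined(__linux__) && defined(__aarch64__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)  // fallback for old kernel headers, as in the patch
#endif
// The kernel reports CPU features through the ELF auxiliary vector;
// the HWCAP_CRC32 bit means the CRC32 instructions are available.
static bool HasHardwareCrc32() {
  return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
}
#else
static bool HasHardwareCrc32() { return false; }
#endif

int main() {
  std::printf("hardware crc32c: %s\n", HasHardwareCrc32() ? "yes" : "no");
  return 0;
}
```

Detection happens once; `Choose_Extend()` then binds a function pointer to the fastest available implementation, so per-call overhead is unchanged.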
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221
Differential Revision: D15013685
Pulled By: siying
fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12
---
 Makefile             |  6 +++++
 src.mk               |  5 ++++
 util/crc32c.cc       | 27 ++++++++++++++++++---
 util/crc32c_arm64.cc | 56 ++++++++++++++++++++++++++++++++++++++++++++
 util/crc32c_arm64.h  | 21 +++++++++++++++++
 5 files changed, 112 insertions(+), 3 deletions(-)
 create mode 100644 util/crc32c_arm64.cc
 create mode 100644 util/crc32c_arm64.h

diff --git a/Makefile b/Makefile
index eee0f9fba02..928046f0050 100644
--- a/Makefile
+++ b/Makefile
@@ -137,6 +137,12 @@ CFLAGS += -DHAVE_POWER8
 HAVE_POWER8=1
 endif
 
+ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
+CXXFLAGS += -march=armv8-a+crc
+CFLAGS += -march=armv8-a+crc
+ARMCRC_SOURCE=1
+endif
+
 # if we're compiling for release, compile without debug code (-DNDEBUG)
 ifeq ($(DEBUG_LEVEL),0)
 OPT += -DNDEBUG
diff --git a/src.mk b/src.mk
index 55b4e3427c6..e3fe5632f87 100644
--- a/src.mk
+++ b/src.mk
@@ -216,6 +216,11 @@ LIB_SOURCES = \
   utilities/write_batch_with_index/write_batch_with_index.cc \
   utilities/write_batch_with_index/write_batch_with_index_internal.cc \
 
+ifeq ($(ARMCRC_SOURCE),1)
+LIB_SOURCES +=\
+  util/crc32c_arm64.cc
+endif
+
 ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
 LIB_SOURCES_ASM =\
   util/crc32c_ppc_asm.S
diff --git a/util/crc32c.cc b/util/crc32c.cc
index 9e4b65e66e1..e8d4116ff42 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -18,6 +18,8 @@
 #include "util/coding.h"
 #include "util/util.h"
 
+#include "util/crc32c_arm64.h"
+
 #ifdef __powerpc64__
 #include "util/crc32c_ppc.h"
 #include "util/crc32c_ppc_constants.h"
@@ -463,6 +465,11 @@ static bool isAltiVec() {
 }
 #endif
 
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
+  return crc32c_arm64(crc, (const unsigned char *)buf, size);
+}
+#endif
 
 std::string IsFastCrc32Supported() {
   bool has_fast_crc = false;
@@ -478,6 +485,14 @@ std::string IsFastCrc32Supported() {
     has_fast_crc = false;
     arch = "PPC";
 #endif
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  if (crc32c_runtime_check()) {
+    has_fast_crc = true;
+    arch = "Arm64";
+  } else {
+    has_fast_crc = false;
+    arch = "Arm64";
+  }
 #else
   has_fast_crc = isSSE42();
   arch = "x86";
@@ -1200,7 +1215,15 @@ uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
 #endif //HAVE_SSE42 && HAVE_PCLMUL
 
 static inline Function Choose_Extend() {
-#ifndef HAVE_POWER8
+#ifdef HAVE_POWER8
+  return isAltiVec() ? ExtendPPCImpl : ExtendImpl;
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  if(crc32c_runtime_check()) {
+    return ExtendARMImpl;
+  } else {
+    return ExtendImpl;
+  }
+#else
   if (isSSE42()) {
     if (isPCLMULQDQ()) {
 #if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
@@ -1216,8 +1239,6 @@ static inline Function Choose_Extend() {
   else {
     return ExtendImpl;
   }
-#else  //HAVE_POWER8
-  return isAltiVec() ? ExtendPPCImpl : ExtendImpl;
 #endif
 }

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
new file mode 100644
index 00000000000..62fabe99e3c
--- /dev/null
+++ b/util/crc32c_arm64.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/crc32c_arm64.h"
+
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+uint32_t crc32c_runtime_check(void) {
+  uint64_t auxv = getauxval(AT_HWCAP);
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
+                      unsigned len) {
+  const uint8_t *buf1;
+  const uint16_t *buf2;
+  const uint32_t *buf4;
+  const uint64_t *buf8;
+
+  int64_t length = (int64_t)len;
+
+  crc ^= 0xffffffff;
+  buf8 = (const uint64_t *)data;
+  while ((length -= sizeof(uint64_t)) >= 0) {
+    crc = __crc32cd(crc, *buf8++);
+  }
+
+  /* The following is more efficient than the straight loop */
+  buf4 = (const uint32_t *)buf8;
+  if (length & sizeof(uint32_t)) {
+    crc = __crc32cw(crc, *buf4++);
+    length -= 4;
+  }
+
+  buf2 = (const uint16_t *)buf4;
+  if (length & sizeof(uint16_t)) {
+    crc = __crc32ch(crc, *buf2++);
+    length -= 2;
+  }
+
+  buf1 = (const uint8_t *)buf2;
+  if (length & sizeof(uint8_t))
+    crc = __crc32cb(crc, *buf1);
+
+  crc ^= 0xffffffff;
+  return crc;
+}
+
+#endif
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
new file mode 100644
index 00000000000..0e77ecd0ef5
--- /dev/null
+++ b/util/crc32c_arm64.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+ +#ifndef UTIL_CRC32C_ARM64_H +#define UTIL_CRC32C_ARM64_H + +#include + +#if defined(__aarch64__) || defined(__AARCH64__) +#ifdef __ARM_FEATURE_CRC32 +#define HAVE_ARM64_CRC +#include +extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len); +extern uint32_t crc32c_runtime_check(void); +#endif +#endif + + +#endif From 36ea379cdc542c81af9d708f04151f8228b0425e Mon Sep 17 00:00:00 2001 From: Fosco Marotto Date: Tue, 30 Apr 2019 15:05:25 -0700 Subject: [PATCH 006/572] Update history and version for future 6.2.0 (#5270) Summary: Update history before branch cut. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5270 Differential Revision: D15153700 Pulled By: gfosco fbshipit-source-id: 2c81e01a2ab965661b1d88209dca74ba0a3756cb --- HISTORY.md | 2 ++ include/rocksdb/version.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 66dd73965ec..4b08ce9d170 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased + +## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 2e8b496819c..7b7d7e86224 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 1 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_MINOR 2 +#define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 521d234bdabb00bdaf60ebb207f67256deec648d Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 1 May 2019 10:04:21 -0700 Subject: [PATCH 007/572] Revert snap_refresh_nanos feature (#5269) Summary: Our daily stress tests are failing after this feature. Reverting temporarily until we figure the reason for test failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5269 Differential Revision: D15151285 Pulled By: maysamyabandeh fbshipit-source-id: e4002b99690a97df30d4b4b58bf0f61e9591bc6e --- HISTORY.md | 1 - db/c.cc | 5 -- db/compaction_iterator.cc | 60 +++++-------- db/compaction_iterator.h | 60 +------------ db/compaction_job.cc | 5 +- db/compaction_job.h | 31 +++---- db/compaction_job_test.cc | 141 +++--------------------------- db/db_impl.h | 9 +- db/db_impl_compaction_flush.cc | 37 +------- db/snapshot_impl.h | 14 +-- include/rocksdb/c.h | 2 - include/rocksdb/options.h | 11 --- options/cf_options.cc | 2 - options/cf_options.h | 3 - options/options.cc | 4 - options/options_helper.cc | 25 +++--- options/options_settable_test.cc | 1 - options/options_test.cc | 2 - table/mock_table.cc | 14 --- table/mock_table.h | 8 -- util/compaction_job_stats_impl.cc | 3 - 21 files changed, 70 insertions(+), 368 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 4b08ce9d170..2d3fd87c88c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,7 +8,6 @@ * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. * Block-based table index now contains exact highest key in the file, rather than an upper bound. 
 * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior.
 * When reading from option file/string/map, customized envs can be filled according to object registry.
-* Add an option `snap_refresh_nanos` (default to 0.5s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
 * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator.
 
 ### Public API Change
diff --git a/db/c.cc b/db/c.cc
index aac1cf4087c..9f5995a413b 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2226,11 +2226,6 @@ void rocksdb_options_set_max_bytes_for_level_base(
   opt->rep.max_bytes_for_level_base = n;
 }
 
-void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt,
-                                            uint64_t n) {
-  opt->rep.snap_refresh_nanos = n;
-}
-
 void rocksdb_options_set_level_compaction_dynamic_level_bytes(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.level_compaction_dynamic_level_bytes = v;
diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index bce0b82dbc7..93c2b5fa9e9 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -38,16 +38,14 @@ CompactionIterator::CompactionIterator(
     CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
     const CompactionFilter* compaction_filter,
     const std::atomic<bool>* shutting_down,
-    const SequenceNumber preserve_deletes_seqnum,
-    SnapshotListFetchCallback* snap_list_callback)
+    const SequenceNumber preserve_deletes_seqnum)
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots,
          earliest_write_conflict_snapshot, snapshot_checker, env,
          report_detailed_time, expect_valid_internal_key, range_del_agg,
          std::unique_ptr<CompactionProxy>(compaction ?
new CompactionProxy(compaction) : nullptr), - compaction_filter, shutting_down, preserve_deletes_seqnum, - snap_list_callback) {} + compaction_filter, shutting_down, preserve_deletes_seqnum) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -59,8 +57,7 @@ CompactionIterator::CompactionIterator( std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const SequenceNumber preserve_deletes_seqnum) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -78,8 +75,7 @@ CompactionIterator::CompactionIterator( current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), - current_key_committed_(false), - snap_list_callback_(snap_list_callback) { + current_key_committed_(false) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -87,7 +83,24 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - ProcessSnapshotList(); + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -209,28 +222,6 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } -void CompactionIterator::ProcessSnapshotList() { -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } - released_snapshots_.clear(); -} - void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; @@ -278,13 +269,6 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. 
if (!has_current_user_key_ || !cmp_->Equal(ikey_.user_key, current_user_key_)) { - num_keys_++; - // Use num_keys_ to reduce the overhead of reading current time - if (snap_list_callback_ && snapshots_->size() && - snap_list_callback_->TimeToRefresh(num_keys_)) { - snap_list_callback_->Refresh(snapshots_, latest_snapshot_); - ProcessSnapshotList(); - } // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index 6ab43b1becf..a9e7a262071 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -21,53 +21,6 @@ namespace rocksdb { -// This callback can be used to refresh the snapshot list from the db. It -// includes logics to exponentially decrease the refresh rate to limit the -// overhead of refresh. -class SnapshotListFetchCallback { - public: - SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos, - size_t every_nth_key = 1024) - : timer_(env, /*auto restart*/ true), - snap_refresh_nanos_(snap_refresh_nanos), - every_nth_key_minus_one_(every_nth_key - 1) { - assert(every_nth_key > 0); - assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key)))); - } - // Refresh the snapshot list. snapshots will bre replacted with the new list. - // max is the upper bound. Note: this function will acquire the db_mutex_. - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) = 0; - inline bool TimeToRefresh(const size_t key_index) { - // skip the key if key_index % every_nth_key (which is of power 2) is not 0. - if ((key_index & every_nth_key_minus_one_) != 0) { - return false; - } - const uint64_t elapsed = timer_.ElapsedNanos(); - auto ret = elapsed > snap_refresh_nanos_; - // pre-compute the next time threshold - if (ret) { - // inc next refresh period exponentially (by x4) - auto next_refresh_threshold = snap_refresh_nanos_ << 2; - // make sure the shift has not overflown the highest 1 bit - snap_refresh_nanos_ = - std::max(snap_refresh_nanos_, next_refresh_threshold); - } - return ret; - } - static constexpr SnapshotListFetchCallback* kDisabled = nullptr; - - virtual ~SnapshotListFetchCallback() {} - - private: - // Time since the callback was created - StopWatchNano timer_; - // The delay before calling ::Refresh. To be increased exponentially. - uint64_t snap_refresh_nanos_; - // Skip evey nth key. Number n if of power 2. The math will require n-1. - const uint64_t every_nth_key_minus_one_; -}; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what @@ -116,8 +69,7 @@ class CompactionIterator { const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + const SequenceNumber preserve_deletes_seqnum = 0); // Constructor with custom CompactionProxy, used for tests. 
CompactionIterator(InternalIterator* input, const Comparator* cmp, @@ -130,8 +82,7 @@ class CompactionIterator { std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + const SequenceNumber preserve_deletes_seqnum = 0); ~CompactionIterator(); @@ -159,8 +110,6 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); - // Process snapshots_ and assign related variables - void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -195,7 +144,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - std::vector* snapshots_; + const std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -270,9 +219,6 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - SnapshotListFetchCallback* snap_list_callback_; - // number of distinct keys processed - size_t num_keys_ = 0; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index bc127a4c45c..45221a15512 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -315,7 +315,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) + Env::Priority thread_pri) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -336,7 +336,6 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), - snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -893,7 +892,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, snap_list_callback_)); + shutting_down_, preserve_deletes_seqnum_)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { diff --git a/db/compaction_job.h b/db/compaction_job.h index b3a0f2eb4b5..9767985f336 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,20 +57,22 @@ class VersionSet; class CompactionJob { public: - CompactionJob( - int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - 
Directory* db_directory, Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri); ~CompactionJob(); @@ -150,7 +152,6 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; - SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. For any user-key newer than this snapshot, we diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 60394cc9735..f05a8ec2ff7 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -5,13 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include -#include #include #include #include @@ -200,13 +194,6 @@ class CompactionJobTest : public testing::Test { } void NewDB() { - DestroyDB(dbname_, Options()); - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_)); - compaction_job_stats_.Reset(); - VersionEdit new_db; new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -243,10 +230,7 @@ class CompactionJobTest : public testing::Test { const std::vector>& input_files, const stl_wrappers::KVMap& expected_results, const std::vector& snapshots = {}, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, - int output_level = 1, bool verify = true, - SnapshotListFetchCallback* snapshot_fetcher = - SnapshotListFetchCallback::kDisabled) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -263,7 +247,7 @@ class CompactionJobTest : public testing::Test { Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - compaction_input_files, output_level, 1024 * 1024, + compaction_input_files, 1, 1024 * 1024, 10 * 1024 * 1024, 0, kNoCompression, cfd->ioptions()->compression_opts, 0, {}, true); compaction.SetInputVersion(cfd->current()); @@ -279,7 +263,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, 
table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, snapshot_fetcher); + Env::Priority::USER); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -291,17 +275,15 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); - if (verify) { - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); - } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); - mock_table_factory_->AssertLatestFile(expected_results); - } + if (expected_results.size() == 0) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + } else { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); + mock_table_factory_->AssertLatestFile(expected_results); } } @@ -956,105 +938,6 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } -// Test the snapshot fetcher in compaction -TEST_F(CompactionJobTest, SnapshotRefresh) { - uint64_t time_seed = env_->NowMicros(); - printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce - Random64 rand(time_seed); - std::vector db_snapshots; - class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackTest(Env* env, Random64& rand, - std::vector* snapshots) - : SnapshotListFetchCallback(env, 0 /*no time delay*/, - 1 /*fetch after each key*/), - rand_(rand), - snapshots_(snapshots) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber) override { - assert(snapshots->size()); - assert(snapshots_->size()); - assert(snapshots_->size() == snapshots->size()); - if (rand_.OneIn(2)) { - uint64_t release_index = rand_.Uniform(snapshots_->size()); - snapshots_->erase(snapshots_->begin() + release_index); - *snapshots = *snapshots_; - } - } - - private: - Random64 rand_; - std::vector* snapshots_; - } snapshot_fetcher(env_, rand, &db_snapshots); - - std::vector> file1_kvs, file2_kvs; - std::array types = {kTypeValue, kTypeDeletion, - kTypeSingleDeletion}; - SequenceNumber last_seq = 0; - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq = seq; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file1_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file1 = mock::MakeMockFile(file1_kvs); - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq++; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file2_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file2 = mock::MakeMockFile(file2_kvs); - for (SequenceNumber i = 1; i < last_seq + 1; i++) { - if (rand.OneIn(5)) { - db_snapshots.push_back(i); - } - } - - const bool kVerify = true; - const int output_level_0 = 0; - NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - auto files = 
cfd_->current()->storage_info()->LevelFiles(0); - // put the output on L0 since it is easier to feed them again to the 2nd - // compaction - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify, &snapshot_fetcher); - - // Now db_snapshots are changed. Run the compaction again without snapshot - // fetcher but with the updated snapshot list. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); - // The result should be what we get if we run compaction without snapshot - // fetcher on the updated list of snapshots - auto expected = mock_table_factory_->output(); - - NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify); - // The 2nd compaction above would get rid of useless delete markers. To get - // the output here exactly as what we got above after two compactions, we also - // run the compaction for 2nd time. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl.h b/db/db_impl.h index 623f69ba6ef..9bdb0abdc10 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -561,13 +561,6 @@ class DBImpl : public DB { const SnapshotList& snapshots() const { return snapshots_; } - void LoadSnapshots(std::vector* snap_vector, - SequenceNumber* oldest_write_conflict_snapshot, - const SequenceNumber& max_seq) const { - InstrumentedMutexLock l(mutex()); - snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); - } - const ImmutableDBOptions& immutable_db_options() const { return immutable_db_options_; } @@ -746,7 +739,7 @@ class DBImpl : public DB { // Not thread-safe. 
void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); - InstrumentedMutex* mutex() const { return &mutex_; } + InstrumentedMutex* mutex() { return &mutex_; } Status NewDB(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index f16c6111752..49b6c0fd804 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -798,29 +798,6 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } -class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, - uint64_t snap_refresh_nanos, Logger* info_log) - : SnapshotListFetchCallback(env, snap_refresh_nanos), - db_impl_(db_impl), - info_log_(info_log) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) override { - size_t prev = snapshots->size(); - snapshots->clear(); - db_impl_->LoadSnapshots(snapshots, nullptr, max); - size_t now = snapshots->size(); - ROCKS_LOG_DEBUG(info_log_, - "Compaction snapshot count refreshed from %zu to %zu", prev, - now); - } - - private: - DBImpl* db_impl_; - Logger* info_log_; -}; - Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -992,9 +969,6 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -1004,9 +978,7 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, - immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback - : nullptr); + &compaction_job_stats, Env::Priority::USER); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -2650,9 +2622,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2662,9 +2631,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri, - immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback - : nullptr); + &compaction_job_stats, thread_pri); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index f1cf6f4b755..f2610fd18b2 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -91,23 +91,13 @@ class SnapshotList { SequenceNumber* oldest_write_conflict_snapshot = nullptr, const SequenceNumber& max_seq = kMaxSequenceNumber) const { std::vector ret; - GetAll(&ret, oldest_write_conflict_snapshot, max_seq); - return ret; - } - - void GetAll(std::vector* snap_vector, - SequenceNumber* oldest_write_conflict_snapshot = nullptr, - const SequenceNumber& max_seq = kMaxSequenceNumber) const { - std::vector& ret = *snap_vector; - // So far we have no use case that would pass a non-empty vector - assert(ret.size() == 0); if (oldest_write_conflict_snapshot != nullptr) { *oldest_write_conflict_snapshot = kMaxSequenceNumber; } if (empty()) { - return; + return ret; } const SnapshotImpl* s = &list_; while (s->next_ != &list_) { @@ -129,7 +119,7 @@ class SnapshotList { s = s->next_; } - return; + return ret; } // get the sequence number of the most recent snapshot diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..4b34996a730 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -816,8 +816,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( - rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ab856bee8e1..4cc2998b2d8 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,17 +269,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // If non-zero, compactions will periodically refresh the snapshot list. The - // delay for the first refresh is snap_refresh_nanos nano seconds and - // exponentially increases afterwards. When having many short-lived snapshots, - // this option helps reducing the cpu usage of long-running compactions. The - // feature is disabled when max_subcompactions is greater than one. - // - // Default: 0.5s - // - // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 500 * 1000 * 1000; // 0.5s - // Disable automatic compactions. 
Manual compactions can still // be issued on this column family // diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..78accaeb915 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -169,8 +169,6 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index 47fca58fa7d..d0c4390c36d 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -149,7 +149,6 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), - snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -186,7 +185,6 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), - snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -238,7 +236,6 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; - uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/options.cc b/options/options.cc index 900510d01b6..bfe3e313d30 100644 --- a/options/options.cc +++ b/options/options.cc @@ -215,9 +215,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_HEADER( - log, " Options.snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -493,7 +490,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; - snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; diff --git a/options/options_helper.cc b/options/options_helper.cc index a973bbfde51..b7781ff6d25 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -177,7 +177,6 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; - cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -527,9 +526,9 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, opt_address)); case OptionType::kBlockBasedTableIndexShorteningMode: return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + block_base_table_index_shortening_mode_string_map, value, + reinterpret_cast( + opt_address)); case 
OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -1667,13 +1666,13 @@ std::unordered_map std::unordered_map OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, @@ -1911,10 +1910,6 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, - {"snap_refresh_nanos", - {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..005b9d53a89 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -415,7 +415,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" - "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..fbfee311b0a 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -74,7 +74,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, - {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -184,7 +183,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..65a43616969 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -21,12 +21,6 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace -stl_wrappers::KVMap MakeMockFile( - std::vector> l) { - return stl_wrappers::KVMap(l.begin(), l.end(), - stl_wrappers::LessOfComparator(&icmp_)); -} - stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ 
-143,14 +137,6 @@ void MockTableFactory::AssertLatestFile(
     ParseInternalKey(Slice(key), &ikey);
     std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
   }
-  std::cout << "Expected:" << std::endl;
-  for (const auto& kv : file_contents) {
-    ParsedInternalKey ikey;
-    std::string key, value;
-    std::tie(key, value) = kv;
-    ParseInternalKey(Slice(key), &ikey);
-    std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
-  }
   FAIL();
 }
diff --git a/table/mock_table.h b/table/mock_table.h
index 5bca14644d8..2f123a963cd 100644
--- a/table/mock_table.h
+++ b/table/mock_table.h
@@ -28,8 +28,6 @@ namespace mock {
 
 stl_wrappers::KVMap MakeMockFile(
     std::initializer_list<std::pair<std::string, std::string>> l = {});
-stl_wrappers::KVMap MakeMockFile(
-    std::vector<std::pair<std::string, std::string>> l);
 
 struct MockTableFileSystem {
   port::Mutex mutex;
@@ -186,12 +184,6 @@ class MockTableFactory : public TableFactory {
   // contents are equal to file_contents
   void AssertSingleFile(const stl_wrappers::KVMap& file_contents);
   void AssertLatestFile(const stl_wrappers::KVMap& file_contents);
-  stl_wrappers::KVMap output() {
-    assert(!file_system_.files.empty());
-    auto latest = file_system_.files.end();
-    --latest;
-    return latest->second;
-  }
 
  private:
   uint32_t GetAndWriteNextID(WritableFileWriter* file) const;
diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc
index fe9efd1f092..a1ebc8b9617 100644
--- a/util/compaction_job_stats_impl.cc
+++ b/util/compaction_job_stats_impl.cc
@@ -40,9 +40,6 @@ void CompactionJobStats::Reset() {
   file_fsync_nanos = 0;
   file_prepare_write_nanos = 0;
 
-  smallest_output_key_prefix.clear();
-  largest_output_key_prefix.clear();
-
   num_single_del_fallthru = 0;
   num_single_del_mismatch = 0;
 }

From 4e0f2aadb036c42950abe01fd8a777b576c44331 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Wed, 1 May 2019 10:13:33 -0700
Subject: [PATCH 008/572] DB::Close() to fail when there are unreleased snapshots (#5272)

Summary:
Sometimes users make the mistake of not releasing snapshots before closing the DB. This is an undocumented use of RocksDB and its behavior is unspecified. We change DB::Close() to give users a way to detect the mistake: Aborted() is now returned from DB::Close() when there are unreleased snapshots.
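To make the new contract concrete, here is a hedged usage sketch; it mirrors the pattern exercised by the `CloseWithUnreleasedSnapshot` test added below, with the function name being ours.

```cpp
#include <cassert>

#include "rocksdb/db.h"

// Sketch, assuming `db` is an open rocksdb::DB*. Close() now reports
// Aborted() while a snapshot is still outstanding; releasing it and
// retrying succeeds.
void CloseCarefully(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  // ... reads against `snap` ...
  rocksdb::Status s = db->Close();
  assert(s.IsAborted());   // snapshot still held
  db->ReleaseSnapshot(snap);
  s = db->Close();         // now succeeds
  assert(s.ok());
  delete db;               // the DB object must still be freed
}
```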
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5272
Differential Revision: D15159713
Pulled By: siying
fbshipit-source-id: 39369def612398d9f239d83d396b5a28e5af65cd
---
 HISTORY.md           |  2 ++
 db/db_impl.cc        | 14 ++++++++++++++
 db/db_test2.cc       | 15 +++++++++++++++
 include/rocksdb/db.h | 10 +++++++---
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2d3fd87c88c..2662cdea016 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
+### Public API Change
+* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released.
 
 ## 6.2.0 (4/30/2019)
 ### New Features
diff --git a/db/db_impl.cc b/db/db_impl.cc
index c6268d0cb80..3ec9e2ab2d6 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -582,6 +582,12 @@ Status DBImpl::CloseHelper() {
       ret = s;
     }
   }
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for those where users didn't release
+    // certain resource and they can release them and come back and
+    // retry. In this case, we wrap this exception to something else.
+    return Status::Incomplete(ret.ToString());
+  }
   return ret;
 }
 
@@ -3036,6 +3042,14 @@ DB::~DB() {}
 
 Status DBImpl::Close() {
   if (!closed_) {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      // If there is unreleased snapshot, fail the close call
+      if (!snapshots_.empty()) {
+        return Status::Aborted("Cannot close DB with unreleased snapshot.");
+      }
+    }
+
     closed_ = true;
     return CloseImpl();
   }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 75e7fe4abba..d93beb4477f 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3738,6 +3738,21 @@ TEST_F(DBTest2, OldStatsInterface) {
   ASSERT_GT(dos->num_rt, 0);
   ASSERT_GT(dos->num_mt, 0);
 }
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 8bec4a56f94..7b49b92c239 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -232,9 +232,13 @@ class DB {
   // status in case there are any errors. This will not fsync the WAL files.
   // If syncing is required, the caller must first call SyncWAL(), or Write()
   // using an empty write batch with WriteOptions.sync=true.
-  // Regardless of the return status, the DB must be freed. If the return
-  // status is NotSupported(), then the DB implementation does cleanup in the
-  // destructor
+  // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there is
+  // unreleased snapshot in the system. In this case, users can release
+  // the unreleased snapshots and try again and expect it to succeed. For
+  // other status, recalling Close() will be no-op.
+  // If the return status is NotSupported(), then the DB implementation does
+  // cleanup in the destructor
   virtual Status Close() { return Status::NotSupported(); }
 
   // ListColumnFamilies will open the DB specified by argument name

From 4479dff208f8880ad853d9d6c52df64d90b6a0c1 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Wed, 1 May 2019 14:23:48 -0700
Subject: [PATCH 009/572] Reduce binary search when reseek into the same data block (#5256)

Summary:
Right now, when Seek() is called again, RocksDB always does a binary search against the files and index blocks, even if they end up with the same file/block. Improve it as follows:
1. in LevelIterator, reseek first checks the boundary of the current file. If the key falls into the same file, skip the binary search that finds the file.
2. in the block-based table iterator, reseek skips re-seeking the index block if the seek key is larger than the current key and lower than the index key (the boundary between the current block and the next block).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5256
Differential Revision: D15105072
Pulled By: siying
fbshipit-source-id: 39634bdb4a881082451fa39cecd7ecf12160bf80
---
 HISTORY.md                        |  3 +
 db/db_iterator_test.cc            | 98 +++++++++++++++++++++++++++
 db/version_set.cc                 | 20 ++++++-
 table/block.cc                    |  1 +
 table/block_based_table_reader.cc | 36 +++++++---
 5 files changed, 147 insertions(+), 11 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2662cdea016..011ce0a995d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,9 @@
 ### Public API Change
 * Now DB::Close() will return Aborted() error when there is unreleased snapshot.
Users can retry after all snapshots are released. +### New Features +* Reduce binary search when iterator reseek into the same data block. + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index ec5fc8006b8..78b387577dd 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2450,6 +2450,104 @@ TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { ASSERT_EQ("a", it->key().ToString()); } +TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + // A new block + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + int num_find_file_in_level = 0; + int num_idx_blk_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "LevelIterator::Seek:BeforeFindFile", + [&](void* /*arg*/) { num_find_file_in_level++; }); + SyncPoint::GetInstance()->SetCallBack( + "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(3, num_idx_blk_seek); + + iter->Seek("8"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(2, num_find_file_in_level); + // Still re-seek because "8" is the boundary key, which has + // the same user key as the seek key. 
+ ASSERT_EQ(4, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + // Seek backward never triggers the index block seek to be skipped + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(6, num_idx_blk_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index fdc07fee0e5..63d5af3af8d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1007,9 +1007,25 @@ class LevelIterator final : public InternalIterator { }; void LevelIterator::Seek(const Slice& target) { - size_t new_file_index = FindFile(icomparator_, *flevel_, target); + // Check whether the seek key fall under the same file + bool need_to_reseek = true; + if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (icomparator_.InternalKeyComparator::Compare( + target, cur_file.largest_key) <= 0 && + icomparator_.InternalKeyComparator::Compare( + target, cur_file.smallest_key) >= 0) { + need_to_reseek = false; + assert(static_cast(FindFile(icomparator_, *flevel_, target)) == + file_index_); + } + } + if (need_to_reseek) { + TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + InitFileIterator(new_file_index); + } - InitFileIterator(new_file_index); if (file_iter_.iter() != nullptr) { file_iter_.Seek(target); } diff --git a/table/block.cc b/table/block.cc index 80bef4a913f..a6cc8d2705f 100644 --- a/table/block.cc +++ b/table/block.cc @@ -381,6 +381,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index d6c9ab88796..e39fd2a860d 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2334,17 +2334,35 @@ void BlockBasedTableIterator::Seek(const Slice& target) { return; } - SavePrevIndexValue(); - - index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; + bool need_seek_index = true; + if (block_iter_points_to_real_block_) { + // Reseek. + prev_index_value_ = index_iter_->value(); + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. 
+      if (user_comparator_.Compare(ExtractUserKey(target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+  }
+
+  if (need_seek_index) {
+    index_iter_->Seek(target);
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+    InitDataBlock();
   }
 
-  InitDataBlock();
-
   block_iter_.Seek(target);
 
   FindKeyForward();

From d51eb0b583fe28ede2b4a6d778de4489433f1bbf Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Wed, 1 May 2019 20:36:09 -0700
Subject: [PATCH 010/572] set snappy compression only when supported (#4325)

Summary:
Right now `OptimizeLevelStyleCompaction` may set the compression type to Snappy even when Snappy is not supported, which may cause errors like "no snappy compression support"

Fixes https://github.com/facebook/rocksdb/issues/4283
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4325

Differential Revision: D15125542

Pulled By: miasantreble

fbshipit-source-id: 70890b73ababe16752721555dbd290633c2aafac
---
 options/options.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/options/options.cc b/options/options.cc
index bfe3e313d30..5e0c539afb5 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -548,7 +548,10 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction(
     if (i < 2) {
       compression_per_level[i] = kNoCompression;
     } else {
-      compression_per_level[i] = kSnappyCompression;
+      compression_per_level[i] =
+          LZ4_Supported()
+              ? kLZ4Compression
+              : (Snappy_Supported() ? kSnappyCompression : kNoCompression);
     }
   }
   return this;

From 434ccf2df4ead37156edc4b45071c17c7fbad3b3 Mon Sep 17 00:00:00 2001
From: anand76
Date: Wed, 1 May 2019 23:04:03 -0700
Subject: [PATCH 011/572] Add option to use MultiGet in db_stress (#5264)

Summary:
The new option will pick a batch size randomly in the range 1-64. It will then space the keys in the batch by random intervals, roughly as in the sketch below.
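As a rough illustration of this spacing scheme (a simplified, self-contained sketch; `PickBatchKeys` and the plain `std::mt19937_64` generator are illustrative stand-ins, not the actual db_stress helpers):
```
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Sketch: pick `num_keys` non-decreasing keys starting near `base_key`,
// spaced by random gaps inside the window [base_key, base_key + width).
std::vector<int64_t> PickBatchKeys(std::mt19937_64& rng, int num_keys,
                                   int64_t base_key, int64_t width) {
  std::vector<int64_t> keys;
  keys.reserve(num_keys);
  int64_t next_key = base_key + static_cast<int64_t>(rng() % width);
  keys.push_back(next_key);
  for (int i = 1; i < num_keys; ++i) {
    // Advance by a random gap bounded by what is left of the window.
    next_key += static_cast<int64_t>(rng() % (width - (next_key - base_key)));
    keys.push_back(next_key);
  }
  return keys;
}

int main() {
  std::mt19937_64 rng(301);
  int batch_size = 1 + static_cast<int>(rng() % 64);  // batch size in 1-64
  for (int64_t k : PickBatchKeys(rng, batch_size, 0, 1000)) {
    std::printf("%lld\n", static_cast<long long>(k));
  }
}
```
As the comment in the patch below notes, this spacing may produce duplicate keys, which the test tolerates.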
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5264 Differential Revision: D15175522 Pulled By: anand1976 fbshipit-source-id: c16baa69d0f1ff4cf53c55c813ddd82c8aeb58fc --- tools/db_crashtest.py | 1 + tools/db_stress.cc | 167 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6c7fbabbf11..62f72f2b5eb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,6 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), + "use_multiget" : lambda: random.randint(0, 1), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 2ecd2aa6d13..97755fe962a 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -455,6 +455,9 @@ DEFINE_uint64(snapshot_hold_ops, 0, "If non-zero, then releases snapshots N operations after they're " "acquired."); +DEFINE_bool(use_multiget, false, + "If set, use the batched MultiGet API for reads"); + static bool ValidateInt32Percent(const char* flagname, int32_t value) { if (value < 0 || value>100) { fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", @@ -1725,6 +1728,27 @@ class StressTest { return base_key + thread->rand.Next() % FLAGS_active_width; } + static std::vector GenerateNKeys( + ThreadState* thread, + int num_keys, + uint64_t iteration) { + const double completed_ratio = + static_cast(iteration) / FLAGS_ops_per_thread; + const int64_t base_key = static_cast( + completed_ratio * (FLAGS_max_key - FLAGS_active_width)); + std::vector keys; + keys.reserve(num_keys); + int64_t next_key = base_key + thread->rand.Next() % FLAGS_active_width; + keys.push_back(next_key); + for (int i = 1; i < num_keys; ++i) { + // This may result in some duplicate keys + next_key = next_key + thread->rand.Next() % + (FLAGS_active_width - (next_key - base_key)); + keys.push_back(next_key); + } + return keys; + } + static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) { size_t value_sz = ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; @@ -2162,7 +2186,14 @@ class StressTest { int prob_op = thread->rand.Uniform(100); if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { // OPERATION read - TestGet(thread, read_opts, rand_column_families, rand_keys); + if (FLAGS_use_multiget) { + int num_keys = thread->rand.Uniform(64); + rand_keys = GenerateNKeys(thread, num_keys, i); + TestMultiGet(thread, read_opts, rand_column_families, rand_keys); + i += num_keys - 1; + } else { + TestGet(thread, read_opts, rand_column_families, rand_keys); + } } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { // OPERATION prefix scan // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are @@ -2211,6 +2242,11 @@ class StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) = 0; + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) = 0; + virtual Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -2546,6 +2582,8 @@ class StressTest { fprintf(stdout, "Checksum type : %s\n", checksum.c_str()); fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", FLAGS_subcompactions); + fprintf(stdout, "Use MultiGet : %s\n", + FLAGS_use_multiget ? 
"true" : "false"); const char* memtablerep = ""; switch (FLAGS_rep_factory) { @@ -3012,6 +3050,38 @@ class NonBatchedOpsStressTest : public StressTest { return s; } + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + size_t num_keys = rand_keys.size(); + std::vector key_str; + std::vector keys; + std::vector values(num_keys); + std::vector statuses(num_keys); + ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + + for (size_t i = 0; i < num_keys; ++i) { + key_str.emplace_back(Key(rand_keys[i])); + keys.emplace_back(key_str.back()); + } + db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), + statuses.data()); + for (const auto& s : statuses) { + if (s.ok()) { + // found case + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + // not found case + thread->stats.AddGets(1, 0); + } else { + // errors case + thread->stats.AddErrors(1); + } + } + return statuses; + } + virtual Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -3532,6 +3602,70 @@ class BatchedOpsStressTest : public StressTest { return s; } + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& readoptions, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + int num_keys = rand_keys.size(); + std::vector statuses(num_keys); + std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + for (int key = 0; key < 10; ++key) { + std::vector key_slices; + std::vector values(num_keys); + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + std::vector key_str; + std::string from_db; + ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + + for (int rand_key = 0; rand_key < num_keys; ++rand_key) { + key_str.emplace_back(keys[key] + Key(rand_keys[rand_key])); + key_slices.emplace_back(key_str.back()); + } + db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(), + values.data(), statuses.data()); + for (int i = 0; i < num_keys; i++) { + Status s = statuses[i]; + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + char expected_prefix = (keys[key])[0]; + char actual_prefix = (values[i])[0]; + if (actual_prefix != expected_prefix) { + fprintf(stderr, "error expected prefix = %c actual = %c\n", + expected_prefix, actual_prefix); + } + std::string str; + str.assign(values[i].data(), values[i].size()); + values[i].Reset(); + str[0] = ' '; // blank out the differing character + values[i].PinSelf(str); + thread->stats.AddGets(1, 1); + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + // Now that we retrieved all values, check that they all match + for (int i = 1; i < num_keys; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", + key_str[i].c_str(), + StringToHex(values[0].ToString()).c_str(), + StringToHex(values[i].ToString()).c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + } + } + + return statuses; + } + // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P // in the same snapshot where P is the first 
FLAGS_prefix_size - 1 bytes
   // of the key. Each of these 10 scans returns a series of values;
@@ -3747,6 +3881,37 @@ class AtomicFlushStressTest : public StressTest {
     return s;
   }
 
+  virtual std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) {
+    int num_keys = rand_keys.size();
+    std::vector<std::string> key_str;
+    std::vector<Slice> keys;
+    std::vector<PinnableSlice> values(num_keys);
+    std::vector<Status> statuses(num_keys);
+    ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
+
+    for (int i = 0; i < num_keys; ++i) {
+      key_str.emplace_back(Key(rand_keys[i]));
+      keys.emplace_back(key_str.back());
+    }
+    db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), statuses.data());
+    for (auto s : statuses) {
+      if (s.ok()) {
+        // found case
+        thread->stats.AddGets(1, 1);
+      } else if (s.IsNotFound()) {
+        // not found case
+        thread->stats.AddGets(1, 0);
+      } else {
+        // errors case
+        thread->stats.AddErrors(1);
+      }
+    }
+    return statuses;
+  }
+
   virtual Status TestPrefixScan(ThreadState* thread,
                                 const ReadOptions& readoptions,
                                 const std::vector<int>& rand_column_families,

From 5882e847aabe4cd0a90e0cbaf5a5db39a0668322 Mon Sep 17 00:00:00 2001
From: Adam Retter
Date: Thu, 2 May 2019 14:24:21 -0700
Subject: [PATCH 012/572] Allow builds of RocksJava debug releases (#5274)

Summary:
This allows debug releases of RocksJava to be built with the Docker release targets.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5274

Differential Revision: D15185067

Pulled By: sagar0

fbshipit-source-id: f3988e472f281f5844d9a07098344a827b1e7eb1
---
 Makefile | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index 928046f0050..ee20a41bb1a 100644
--- a/Makefile
+++ b/Makefile
@@ -82,17 +82,23 @@ ifeq ($(MAKECMDGOALS),rocksdbjavastatic)
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease)
-  DEBUG_LEVEL=0
+  ifneq ($(DEBUG_LEVEL),2)
+    DEBUG_LEVEL=0
+  endif
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker)
-  DEBUG_LEVEL=0
+  ifneq ($(DEBUG_LEVEL),2)
+    DEBUG_LEVEL=0
+  endif
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish)
   DEBUG_LEVEL=0
 endif
 
+$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
+
 # Lite build flag.
LITE ?= 0 ifeq ($(LITE), 0) @@ -1827,27 +1833,15 @@ rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 roc rocksdbjavastaticdockerx86: mkdir -p java/target - DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ - if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x86-be + docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be + docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - DOCKER_LINUX_PPC64LE_CONTAINER=`docker ps -aqf name=rocksdb_linux_ppc64le-be`; \ - if [ -z "$$DOCKER_LINUX_PPC64LE_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_ppc64le-be evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_ppc64le-be + docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral From 3e994809a1c00ca52fe45e598323e54db18cb90c Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 3 May 2019 09:58:12 -0700 Subject: [PATCH 013/572] fix implicit conversion error reported by clang check (#5277) Summary: fix the following clang check errors ``` tools/db_stress.cc:3609:30: error: implicit conversion loses integer precision: 'std::vector::size_type' (aka 'unsigned long') to 'int' [-Werror,-Wshorten-64-to-32] int num_keys = rand_keys.size(); ~~~~~~~~ ~~~~~~~~~~^~~~~~ tools/db_stress.cc:3888:30: error: implicit conversion loses integer precision: 'std::vector::size_type' (aka 'unsigned long') to 'int' [-Werror,-Wshorten-64-to-32] int num_keys = rand_keys.size(); ~~~~~~~~ ~~~~~~~~~~^~~~~~ 2 errors generated. 
make: *** [tools/db_stress.o] Error 1
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5277

Differential Revision: D15196620

Pulled By: miasantreble

fbshipit-source-id: d56b1420d4a9f1df875fc52877a5fbb342bc7cae
---
 tools/db_stress.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 97755fe962a..4ed66ed6d75 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -3606,7 +3606,7 @@ class BatchedOpsStressTest : public StressTest {
       const ReadOptions& readoptions,
       const std::vector<int>& rand_column_families,
       const std::vector<int64_t>& rand_keys) {
-    int num_keys = rand_keys.size();
+    size_t num_keys = rand_keys.size();
     std::vector<Status> statuses(num_keys);
     std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
     for (int key = 0; key < 10; ++key) {
@@ -3618,13 +3618,13 @@ class BatchedOpsStressTest : public StressTest {
       std::string from_db;
       ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
-      for (int rand_key = 0; rand_key < num_keys; ++rand_key) {
+      for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) {
         key_str.emplace_back(keys[key] + Key(rand_keys[rand_key]));
         key_slices.emplace_back(key_str.back());
       }
       db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(),
                     values.data(), statuses.data());
-      for (int i = 0; i < num_keys; i++) {
+      for (size_t i = 0; i < num_keys; i++) {
         Status s = statuses[i];
         if (!s.ok() && !s.IsNotFound()) {
           fprintf(stderr, "get error: %s\n", s.ToString().c_str());
@@ -3651,7 +3651,7 @@ class BatchedOpsStressTest : public StressTest {
       db_->ReleaseSnapshot(readoptionscopy.snapshot);
 
       // Now that we retrieved all values, check that they all match
-      for (int i = 1; i < num_keys; i++) {
+      for (size_t i = 1; i < num_keys; i++) {
         if (values[i] != values[0]) {
           fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
                   key_str[i].c_str(),
@@ -3885,14 +3885,14 @@ class AtomicFlushStressTest : public StressTest {
       const ReadOptions& read_opts,
       const std::vector<int>& rand_column_families,
       const std::vector<int64_t>& rand_keys) {
-    int num_keys = rand_keys.size();
+    size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
-    for (int i = 0; i < num_keys; ++i) {
+    for (size_t i = 0; i < num_keys; ++i) {
       key_str.emplace_back(Key(rand_keys[i]));
       keys.emplace_back(key_str.back());
     }

From 5d27d65bef4ec40fb7bb61f4f50817279abd85eb Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Fri, 3 May 2019 15:55:48 -0700
Subject: [PATCH 014/572] multiget: fix memory issues due to vector auto resizing (#5279)

Summary:
This PR fixes three memory issues found by ASAN:
* in db_stress, the key vector for MultiGet is created using `emplace_back`, which could potentially invalidate references to the underlying storage (the vector) due to auto resizing; fix by calling reserve in advance (see the sketch after this list)
* a similar issue in the construction of the GetContext autovector in version_set.cc
* in multiget_context.h, use the T[] specialization for the unique_ptr that holds a char array
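To make the first issue concrete, here is a minimal standalone sketch (hypothetical code, not taken from db_stress; plain `const int*` views stand in for `rocksdb::Slice`, which likewise holds a non-owning pointer into the vector's storage):
```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> values;
  std::vector<const int*> views;  // non-owning, like Slices into key storage

  const size_t num_keys = 64;
  // Without this reserve, push_back may reallocate `values` as it grows,
  // leaving every pointer already stored in `views` dangling. Reserving the
  // final size up front (the fix in this PR) guarantees the elements never
  // move while views into them are being collected.
  values.reserve(num_keys);
  for (size_t i = 0; i < num_keys; ++i) {
    values.push_back(static_cast<int>(i));
    views.push_back(&values.back());
  }
  std::printf("first=%d last=%d\n", *views.front(), *views.back());
  return 0;
}
```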
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5279

Differential Revision: D15202893

Pulled By: miasantreble

fbshipit-source-id: 14cc2cda0ed64d29f2a1e264a6bfdaa4294ee75d
---
 db/version_set.cc        | 6 +++++-
 table/multiget_context.h | 2 +-
 tools/db_stress.cc       | 6 ++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 63d5af3af8d..6d4fb7315ad 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1774,7 +1774,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
         iter->value, nullptr, &(iter->merge_context),
         &iter->max_covering_tombstone_seq, this->env_, &iter->seq,
         merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob);
-    iter->get_context = &get_ctx.back();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
   }
 
   MultiGetRange file_picker_range(*range, range->begin(), range->end());
diff --git a/table/multiget_context.h b/table/multiget_context.h
index d3a8d09463b..c9e682fad4b 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -123,7 +123,7 @@ class MultiGetContext {
   KeyContext** sorted_keys_;
   size_t num_keys_;
   uint64_t value_mask_;
-  std::unique_ptr<char> lookup_key_heap_buf;
+  std::unique_ptr<char[]> lookup_key_heap_buf;
   LookupKey* lookup_key_ptr_;
 
  public:
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 4ed66ed6d75..c6959802be3 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -3057,6 +3057,8 @@ class NonBatchedOpsStressTest : public StressTest {
     size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
+    key_str.reserve(num_keys);
+    keys.reserve(num_keys);
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
@@ -3615,6 +3617,8 @@ class BatchedOpsStressTest : public StressTest {
       ReadOptions readoptionscopy = readoptions;
       readoptionscopy.snapshot = db_->GetSnapshot();
       std::vector<std::string> key_str;
+      key_str.reserve(num_keys);
+      key_slices.reserve(num_keys);
       std::string from_db;
       ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
@@ -3888,6 +3892,8 @@ class AtomicFlushStressTest : public StressTest {
     size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
+    keys.reserve(num_keys);
+    key_str.reserve(num_keys);
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];

From 6a40ee5eb1f3179ad7e56a60d27feeacfcfa4d0c Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Fri, 3 May 2019 17:26:20 -0700
Subject: [PATCH 015/572] Refresh snapshot list during long compactions (2nd attempt) (#5278)

Summary:
Part of compaction CPU goes to processing the snapshot list; the larger the list, the bigger the overhead. Although the lifetime of most snapshots is much shorter than the lifetime of compactions, a compaction conservatively operates on the list of snapshots that it initially obtained. This patch allows the snapshot list to be updated via a callback if the compaction is taking long. This should let the compaction continue more efficiently with a much smaller snapshot list (a sketch of the refresh throttle follows below).
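Conceptually, the refresh throttle works roughly like the following simplified, hypothetical distillation of the `SnapshotListFetchCallback` added by this patch (a plain `std::chrono` clock stands in for RocksDB's internal `StopWatchNano` timer):
```
#include <algorithm>
#include <chrono>
#include <cstdint>

// Sketch of the throttle: a cheap mask test on most keys, a clock read only
// on every Nth key, and exponential (x4) back-off of the refresh interval.
class RefreshThrottle {
 public:
  // `every_nth_key` must be a power of two so the modulo becomes a mask.
  RefreshThrottle(uint64_t refresh_nanos, uint64_t every_nth_key)
      : start_(std::chrono::steady_clock::now()),
        refresh_nanos_(refresh_nanos),
        mask_(every_nth_key - 1) {}

  bool TimeToRefresh(uint64_t key_index) {
    if ((key_index & mask_) != 0) {
      return false;  // skip the clock read for most keys
    }
    auto elapsed = static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now() - start_)
            .count());
    bool refresh = elapsed > refresh_nanos_;
    if (refresh) {
      // Quadruple the threshold; std::max guards against shift overflow.
      refresh_nanos_ = std::max(refresh_nanos_, refresh_nanos_ << 2);
    }
    return refresh;
  }

 private:
  std::chrono::steady_clock::time_point start_;
  uint64_t refresh_nanos_;
  const uint64_t mask_;
};
```
The mask test keeps the common path to a single AND and compare, so only every Nth distinct key pays for a clock read.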
For simplicity, the feature is disabled in two cases: i) when more than one sub-compaction shares the same snapshot list, ii) when Range Delete is used, in which case the range delete aggregator has its own copy of the snapshot list.

This fixes the reverted https://github.com/facebook/rocksdb/pull/5099 issue with range deletes.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5278

Differential Revision: D15203291

Pulled By: maysamyabandeh

fbshipit-source-id: fa645611e606aa222c7ce53176dc5bb6f259c258
---
 HISTORY.md                        |   1 +
 db/c.cc                           |   5 ++
 db/compaction_iterator.cc         |  60 ++++++++-----
 db/compaction_iterator.h          |  60 ++++++++++++-
 db/compaction_job.cc              |   7 +-
 db/compaction_job.h               |  31 ++++---
 db/compaction_job_test.cc         | 141 +++++++++++++++++++++++++++---
 db/db_impl.h                      |   9 +-
 db/db_impl_compaction_flush.cc    |  39 ++++++++-
 db/snapshot_impl.h                |  14 ++-
 include/rocksdb/c.h               |   2 +
 include/rocksdb/options.h         |  11 +++
 options/cf_options.cc             |   2 +
 options/cf_options.h              |   3 +
 options/options.cc                |   4 +
 options/options_helper.cc         |  25 +++---
 options/options_settable_test.cc  |   1 +
 options/options_test.cc           |   2 +
 table/mock_table.cc               |  14 +++
 table/mock_table.h                |   8 ++
 tools/db_crashtest.py             |   5 +-
 util/compaction_job_stats_impl.cc |   3 +
 22 files changed, 375 insertions(+), 72 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 011ce0a995d..65d64d23604 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 
 ### New Features
 * Reduce binary search when iterator reseek into the same data block.
+* Add an option `snap_refresh_nanos` (default 0.1s) to periodically refresh the snapshot list in compaction jobs. Set it to 0 to disable the feature.
 
 ## 6.2.0 (4/30/2019)
 ### New Features
diff --git a/db/c.cc b/db/c.cc
index 9f5995a413b..aac1cf4087c 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2226,6 +2226,11 @@ void rocksdb_options_set_max_bytes_for_level_base(
   opt->rep.max_bytes_for_level_base = n;
 }
 
+void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt,
+                                            uint64_t n) {
+  opt->rep.snap_refresh_nanos = n;
+}
+
 void rocksdb_options_set_level_compaction_dynamic_level_bytes(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.level_compaction_dynamic_level_bytes = v;
diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index 93c2b5fa9e9..bce0b82dbc7 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -38,14 +38,16 @@ CompactionIterator::CompactionIterator(
     CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
     const CompactionFilter* compaction_filter,
     const std::atomic<bool>* shutting_down,
-    const SequenceNumber preserve_deletes_seqnum)
+    const SequenceNumber preserve_deletes_seqnum,
+    SnapshotListFetchCallback* snap_list_callback)
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots,
          earliest_write_conflict_snapshot, snapshot_checker, env,
           report_detailed_time, expect_valid_internal_key, range_del_agg,
           std::unique_ptr<CompactionProxy>(
              compaction ?
new CompactionProxy(compaction) : nullptr), - compaction_filter, shutting_down, preserve_deletes_seqnum) {} + compaction_filter, shutting_down, preserve_deletes_seqnum, + snap_list_callback) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -57,7 +59,8 @@ CompactionIterator::CompactionIterator( std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum) + const SequenceNumber preserve_deletes_seqnum, + SnapshotListFetchCallback* snap_list_callback) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -75,7 +78,8 @@ CompactionIterator::CompactionIterator( current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), - current_key_committed_(false) { + current_key_committed_(false), + snap_list_callback_(snap_list_callback) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -83,24 +87,7 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif + ProcessSnapshotList(); input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -222,6 +209,28 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } +void CompactionIterator::ProcessSnapshotList() { +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } + released_snapshots_.clear(); +} + void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; @@ -269,6 +278,13 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. 
if (!has_current_user_key_ ||
       !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+    num_keys_++;
+    // Use num_keys_ to reduce the overhead of reading current time
+    if (snap_list_callback_ && snapshots_->size() &&
+        snap_list_callback_->TimeToRefresh(num_keys_)) {
+      snap_list_callback_->Refresh(snapshots_, latest_snapshot_);
+      ProcessSnapshotList();
+    }
     // First occurrence of this user key
     // Copy key for output
     key_ = current_key_.SetInternalKey(key_, &ikey_);
diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h
index a9e7a262071..6ab43b1becf 100644
--- a/db/compaction_iterator.h
+++ b/db/compaction_iterator.h
@@ -21,6 +21,53 @@
 
 namespace rocksdb {
 
+// This callback can be used to refresh the snapshot list from the db. It
+// includes logic to exponentially decrease the refresh rate to limit the
+// overhead of refresh.
+class SnapshotListFetchCallback {
+ public:
+  SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos,
+                            size_t every_nth_key = 1024)
+      : timer_(env, /*auto restart*/ true),
+        snap_refresh_nanos_(snap_refresh_nanos),
+        every_nth_key_minus_one_(every_nth_key - 1) {
+    assert(every_nth_key > 0);
+    assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key))));
+  }
+  // Refresh the snapshot list. snapshots will be replaced with the new list.
+  // max is the upper bound. Note: this function will acquire the db_mutex_.
+  virtual void Refresh(std::vector<SequenceNumber>* snapshots,
+                       SequenceNumber max) = 0;
+  inline bool TimeToRefresh(const size_t key_index) {
+    // skip the key if key_index % every_nth_key (which is a power of 2) is not 0.
+    if ((key_index & every_nth_key_minus_one_) != 0) {
+      return false;
+    }
+    const uint64_t elapsed = timer_.ElapsedNanos();
+    auto ret = elapsed > snap_refresh_nanos_;
+    // pre-compute the next time threshold
+    if (ret) {
+      // inc next refresh period exponentially (by x4)
+      auto next_refresh_threshold = snap_refresh_nanos_ << 2;
+      // make sure the shift has not overflown the highest 1 bit
+      snap_refresh_nanos_ =
+          std::max(snap_refresh_nanos_, next_refresh_threshold);
+    }
+    return ret;
+  }
+  static constexpr SnapshotListFetchCallback* kDisabled = nullptr;
+
+  virtual ~SnapshotListFetchCallback() {}
+
+ private:
+  // Time since the callback was created
+  StopWatchNano timer_;
+  // The delay before calling ::Refresh. To be increased exponentially.
+  uint64_t snap_refresh_nanos_;
+  // Skip every nth key. Number n is a power of 2. The math requires n-1.
+  const uint64_t every_nth_key_minus_one_;
+};
+
 class CompactionIterator {
  public:
   // A wrapper around Compaction. Has a much smaller interface, only what
@@ -69,7 +116,8 @@ class CompactionIterator {
                      const Compaction* compaction = nullptr,
                      const CompactionFilter* compaction_filter = nullptr,
                      const std::atomic<bool>* shutting_down = nullptr,
-                     const SequenceNumber preserve_deletes_seqnum = 0);
+                     const SequenceNumber preserve_deletes_seqnum = 0,
+                     SnapshotListFetchCallback* snap_list_callback = nullptr);
 
   // Constructor with custom CompactionProxy, used for tests.
CompactionIterator(InternalIterator* input, const Comparator* cmp, @@ -82,7 +130,8 @@ class CompactionIterator { std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0); + const SequenceNumber preserve_deletes_seqnum = 0, + SnapshotListFetchCallback* snap_list_callback = nullptr); ~CompactionIterator(); @@ -110,6 +159,8 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); + // Process snapshots_ and assign related variables + void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -144,7 +195,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - const std::vector* snapshots_; + std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -219,6 +270,9 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; + SnapshotListFetchCallback* snap_list_callback_; + // number of distinct keys processed + size_t num_keys_ = 0; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 45221a15512..00386a99ad4 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -315,7 +315,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri) + Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -336,6 +336,7 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), + snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -892,7 +893,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_)); + shutting_down_, preserve_deletes_seqnum_, + // Currently range_del_agg is incompatible with snapshot refresh feature. + range_del_agg.IsEmpty() ? 
snap_list_callback_ : nullptr)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { diff --git a/db/compaction_job.h b/db/compaction_job.h index 9767985f336..b3a0f2eb4b5 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,22 +57,20 @@ class VersionSet; class CompactionJob { public: - CompactionJob(int job_id, Compaction* compaction, - const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, - CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri); + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); ~CompactionJob(); @@ -152,6 +150,7 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; + SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. 
For any user-key newer than this snapshot, we diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index f05a8ec2ff7..60394cc9735 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -5,7 +5,13 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include +#include #include #include #include @@ -194,6 +200,13 @@ class CompactionJobTest : public testing::Test { } void NewDB() { + DestroyDB(dbname_, Options()); + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_)); + compaction_job_stats_.Reset(); + VersionEdit new_db; new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -230,7 +243,10 @@ class CompactionJobTest : public testing::Test { const std::vector>& input_files, const stl_wrappers::KVMap& expected_results, const std::vector& snapshots = {}, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + int output_level = 1, bool verify = true, + SnapshotListFetchCallback* snapshot_fetcher = + SnapshotListFetchCallback::kDisabled) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -247,7 +263,7 @@ class CompactionJobTest : public testing::Test { Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - compaction_input_files, 1, 1024 * 1024, + compaction_input_files, output_level, 1024 * 1024, 10 * 1024 * 1024, 0, kNoCompression, cfd->ioptions()->compression_opts, 0, {}, true); compaction.SetInputVersion(cfd->current()); @@ -263,7 +279,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER); + Env::Priority::USER, snapshot_fetcher); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -275,15 +291,17 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); - } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); - mock_table_factory_->AssertLatestFile(expected_results); + if (verify) { + if (expected_results.size() == 0) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + } else { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); + mock_table_factory_->AssertLatestFile(expected_results); + } } } @@ -938,6 +956,105 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } +// Test the snapshot fetcher in compaction +TEST_F(CompactionJobTest, SnapshotRefresh) { + uint64_t time_seed = env_->NowMicros(); + printf("time_seed is %" PRIu64 
"\n", time_seed); // would help to reproduce + Random64 rand(time_seed); + std::vector db_snapshots; + class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { + public: + SnapshotListFetchCallbackTest(Env* env, Random64& rand, + std::vector* snapshots) + : SnapshotListFetchCallback(env, 0 /*no time delay*/, + 1 /*fetch after each key*/), + rand_(rand), + snapshots_(snapshots) {} + virtual void Refresh(std::vector* snapshots, + SequenceNumber) override { + assert(snapshots->size()); + assert(snapshots_->size()); + assert(snapshots_->size() == snapshots->size()); + if (rand_.OneIn(2)) { + uint64_t release_index = rand_.Uniform(snapshots_->size()); + snapshots_->erase(snapshots_->begin() + release_index); + *snapshots = *snapshots_; + } + } + + private: + Random64 rand_; + std::vector* snapshots_; + } snapshot_fetcher(env_, rand, &db_snapshots); + + std::vector> file1_kvs, file2_kvs; + std::array types = {kTypeValue, kTypeDeletion, + kTypeSingleDeletion}; + SequenceNumber last_seq = 0; + for (int i = 1; i < 100; i++) { + SequenceNumber seq = last_seq + 1; + last_seq = seq; + if (rand.OneIn(2)) { + auto type = types[rand.Uniform(types.size())]; + file1_kvs.push_back( + {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); + } + } + auto file1 = mock::MakeMockFile(file1_kvs); + for (int i = 1; i < 100; i++) { + SequenceNumber seq = last_seq + 1; + last_seq++; + if (rand.OneIn(2)) { + auto type = types[rand.Uniform(types.size())]; + file2_kvs.push_back( + {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); + } + } + auto file2 = mock::MakeMockFile(file2_kvs); + for (SequenceNumber i = 1; i < last_seq + 1; i++) { + if (rand.OneIn(5)) { + db_snapshots.push_back(i); + } + } + + const bool kVerify = true; + const int output_level_0 = 0; + NewDB(); + AddMockFile(file1); + AddMockFile(file2); + SetLastSequence(last_seq); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + // put the output on L0 since it is easier to feed them again to the 2nd + // compaction + RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, + output_level_0, !kVerify, &snapshot_fetcher); + + // Now db_snapshots are changed. Run the compaction again without snapshot + // fetcher but with the updated snapshot list. + compaction_job_stats_.Reset(); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, + output_level_0 + 1, !kVerify); + // The result should be what we get if we run compaction without snapshot + // fetcher on the updated list of snapshots + auto expected = mock_table_factory_->output(); + + NewDB(); + AddMockFile(file1); + AddMockFile(file2); + SetLastSequence(last_seq); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, + output_level_0, !kVerify); + // The 2nd compaction above would get rid of useless delete markers. To get + // the output here exactly as what we got above after two compactions, we also + // run the compaction for 2nd time. 
+ compaction_job_stats_.Reset(); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, + output_level_0 + 1, !kVerify); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl.h b/db/db_impl.h index 9bdb0abdc10..623f69ba6ef 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -561,6 +561,13 @@ class DBImpl : public DB { const SnapshotList& snapshots() const { return snapshots_; } + void LoadSnapshots(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot, + const SequenceNumber& max_seq) const { + InstrumentedMutexLock l(mutex()); + snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); + } + const ImmutableDBOptions& immutable_db_options() const { return immutable_db_options_; } @@ -739,7 +746,7 @@ class DBImpl : public DB { // Not thread-safe. void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); - InstrumentedMutex* mutex() { return &mutex_; } + InstrumentedMutex* mutex() const { return &mutex_; } Status NewDB(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 49b6c0fd804..1cdadf03942 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -798,6 +798,31 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } +namespace { +class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { + public: + SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, + uint64_t snap_refresh_nanos, Logger* info_log) + : SnapshotListFetchCallback(env, snap_refresh_nanos), + db_impl_(db_impl), + info_log_(info_log) {} + virtual void Refresh(std::vector* snapshots, + SequenceNumber max) override { + size_t prev = snapshots->size(); + snapshots->clear(); + db_impl_->LoadSnapshots(snapshots, nullptr, max); + size_t now = snapshots->size(); + ROCKS_LOG_DEBUG(info_log_, + "Compaction snapshot count refreshed from %zu to %zu", prev, + now); + } + + private: + DBImpl* db_impl_; + Logger* info_log_; +}; +} // namespace + Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -969,6 +994,9 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; + SnapshotListFetchCallbackImpl fetch_callback( + this, env_, c->mutable_cf_options()->snap_refresh_nanos, + immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -978,7 +1006,9 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER); + &compaction_job_stats, Env::Priority::USER, + immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback + : nullptr); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -2622,6 +2652,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); + SnapshotListFetchCallbackImpl fetch_callback( + this, env_, c->mutable_cf_options()->snap_refresh_nanos, + immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2631,7 +2664,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri); + &compaction_job_stats, thread_pri, + immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback + : nullptr); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index f2610fd18b2..f1cf6f4b755 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -91,13 +91,23 @@ class SnapshotList { SequenceNumber* oldest_write_conflict_snapshot = nullptr, const SequenceNumber& max_seq = kMaxSequenceNumber) const { std::vector ret; + GetAll(&ret, oldest_write_conflict_snapshot, max_seq); + return ret; + } + + void GetAll(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot = nullptr, + const SequenceNumber& max_seq = kMaxSequenceNumber) const { + std::vector& ret = *snap_vector; + // So far we have no use case that would pass a non-empty vector + assert(ret.size() == 0); if (oldest_write_conflict_snapshot != nullptr) { *oldest_write_conflict_snapshot = kMaxSequenceNumber; } if (empty()) { - return ret; + return; } const SnapshotImpl* s = &list_; while (s->next_ != &list_) { @@ -119,7 +129,7 @@ class SnapshotList { s = s->next_; } - return ret; + return; } // get the sequence number of the most recent snapshot diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 4b34996a730..a0ae7ca7785 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -816,6 +816,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( + rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 4cc2998b2d8..a1071f62ec7 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,6 +269,17 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; + // If non-zero, compactions will periodically refresh the snapshot list. The + // delay for the first refresh is snap_refresh_nanos nano seconds and + // exponentially increases afterwards. When having many short-lived snapshots, + // this option helps reducing the cpu usage of long-running compactions. 
The + // feature is disabled when max_subcompactions is greater than one. + // + // Default: 0.1s + // + // Dynamically changeable through SetOptions() API + uint64_t snap_refresh_nanos = 100 * 1000 * 1000; // 0.1s + // Disable automatic compactions. Manual compactions can still // be issued on this column family // diff --git a/options/cf_options.cc b/options/cf_options.cc index 78accaeb915..f7af3f834c9 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -169,6 +169,8 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); + ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, + snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index d0c4390c36d..47fca58fa7d 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -149,6 +149,7 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), + snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -185,6 +186,7 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), + snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -236,6 +238,7 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; + uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/options.cc b/options/options.cc index 5e0c539afb5..057727e59fb 100644 --- a/options/options.cc +++ b/options/options.cc @@ -215,6 +215,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); + ROCKS_LOG_HEADER( + log, " Options.snap_refresh_nanos: %" PRIu64, + snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -490,6 +493,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; + snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; diff --git a/options/options_helper.cc b/options/options_helper.cc index b7781ff6d25..a973bbfde51 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -177,6 +177,7 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; + cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -526,9 +527,9 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, opt_address)); case 
OptionType::kBlockBasedTableIndexShorteningMode: return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + block_base_table_index_shortening_mode_string_map, value, + reinterpret_cast( + opt_address)); case OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -1666,13 +1667,13 @@ std::unordered_map std::unordered_map OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, @@ -1910,6 +1911,10 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, + {"snap_refresh_nanos", + {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 005b9d53a89..2d6cc11c02e 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -415,6 +415,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" + "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" diff --git a/options/options_test.cc b/options/options_test.cc index fbfee311b0a..ded336dd18d 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -74,6 +74,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, + {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -183,6 +184,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); diff --git a/table/mock_table.cc b/table/mock_table.cc index 65a43616969..9b250604803 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -21,6 +21,12 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace +stl_wrappers::KVMap 
MakeMockFile( + std::vector> l) { + return stl_wrappers::KVMap(l.begin(), l.end(), + stl_wrappers::LessOfComparator(&icmp_)); +} + stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ -137,6 +143,14 @@ void MockTableFactory::AssertLatestFile( ParseInternalKey(Slice(key), &ikey); std::cout << ikey.DebugString(false) << " -> " << value << std::endl; } + std::cout << "Expected:" << std::endl; + for (const auto& kv : file_contents) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ParseInternalKey(Slice(key), &ikey); + std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + } FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index 2f123a963cd..5bca14644d8 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -28,6 +28,8 @@ namespace mock { stl_wrappers::KVMap MakeMockFile( std::initializer_list> l = {}); +stl_wrappers::KVMap MakeMockFile( + std::vector> l); struct MockTableFileSystem { port::Mutex mutex; @@ -184,6 +186,12 @@ class MockTableFactory : public TableFactory { // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); void AssertLatestFile(const stl_wrappers::KVMap& file_contents); + stl_wrappers::KVMap output() { + assert(!file_system_.files.empty()); + auto latest = file_system_.files.end(); + --latest; + return latest->second; + } private: uint32_t GetAndWriteNextID(WritableFileWriter* file) const; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 62f72f2b5eb..6487562d8bb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -343,8 +343,9 @@ def whitebox_crash_main(args, unknown_args): if additional_opts['kill_random_test'] is None and (retncode == 0): # we expect zero retncode if no kill option expected = True - elif additional_opts['kill_random_test'] is not None and retncode < 0: - # we expect negative retncode if kill option was given + elif additional_opts['kill_random_test'] is not None and retncode <= 0: + # When kill option is given, the test MIGHT kill itself. + # If it does, negative retncode is expected. Otherwise 0. expected = True if not expected: diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc index a1ebc8b9617..fe9efd1f092 100644 --- a/util/compaction_job_stats_impl.cc +++ b/util/compaction_job_stats_impl.cc @@ -40,6 +40,9 @@ void CompactionJobStats::Reset() { file_fsync_nanos = 0; file_prepare_write_nanos = 0; + smallest_output_key_prefix.clear(); + largest_output_key_prefix.clear(); + num_single_del_fallthru = 0; num_single_del_mismatch = 0; } From 930bfa575079a4f99cd1963df7a7f0b3f1b5691d Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 6 May 2019 18:23:45 -0700 Subject: [PATCH 016/572] Disable MultiGet from db_stress (#5284) Summary: Disable it for now until we can get stress tests to pass consistently. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5284 Differential Revision: D15230727 Pulled By: anand1976 fbshipit-source-id: 239baacdb3c4cd4fb7c4447f7582b9042501d752 --- tools/db_crashtest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6487562d8bb..780c987e929 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,7 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), - "use_multiget" : lambda: random.randint(0, 1), + "use_multiget" : 0, } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' From eea1cad850c2e268b0bfde208a005e44289dea47 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Tue, 7 May 2019 20:17:48 -0700 Subject: [PATCH 017/572] avoid updating index type during iterator creation (#5288) Summary: Right now there is a potential race condition where two threads are created to iterate through the DB (https://gist.github.com/miasantreble/88f5798a397ee7cb8e7baff9db2d9e85). The problem is that in `BlockBasedTable::NewIndexIterator`, if both threads fail to find the index_reader in the block cache, they will call `CreateIndexReader->UpdateIndexType()`, which creates a race to update `index_type` in the shared rep_ object. By checking the code, we realize the index type is always populated by `PrefetchIndexAndFilterBlocks` during the table `Open` call, so there is no need to update the index type every time during iterator creation. This PR attempts to fix the race condition by removing the unnecessary call to `UpdateIndexType`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5288 Differential Revision: D15252509 Pulled By: miasantreble fbshipit-source-id: 6e3258652121d5c76d267f7ac457e15c5e84756e --- table/block_based_table_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index e39fd2a860d..514587d0b96 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -3178,7 +3178,7 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { Status BlockBasedTable::CreateIndexReader( FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter, int level) { - auto index_type_on_file = UpdateIndexType(); + auto index_type_on_file = rep_->index_type; auto file = rep_->file.get(); const InternalKeyComparator* icomparator = &rep_->internal_comparator; From bdba6c56dde69b25762b27a9f1f95f51f2ee4551 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Wed, 8 May 2019 10:56:38 -0700 Subject: [PATCH 018/572] add WAL replay in TryCatchUpWithPrimary (#5282) Summary: Previously, in PR https://github.com/facebook/rocksdb/pull/5161, we added the capability to do WAL tailing in `OpenAsSecondary`. In this PR we extend this feature to `TryCatchUpWithPrimary`, which is useful for a secondary RocksDB instance to retrieve and apply the latest updates and refresh log readers if needed.
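As a rough usage sketch (not part of this patch; the paths below are placeholders, and the max_open_files setting reflects an assumed requirement of secondary mode at the time), a reader process opens the primary's directory as a secondary and then periodically catches up:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.max_open_files = -1;  // assumption: secondary mode needs -1 here
      rocksdb::DB* db = nullptr;
      // "/tmp/primary" and "/tmp/secondary" are hypothetical paths.
      rocksdb::Status s = rocksdb::DB::OpenAsSecondary(options, "/tmp/primary",
                                                       "/tmp/secondary", &db);
      assert(s.ok());
      // With this patch, this replays new MANIFEST entries and new WALs.
      s = db->TryCatchUpWithPrimary();
      assert(s.ok());
      std::string value;
      s = db->Get(rocksdb::ReadOptions(), "foo", &value);  // sees fresh writes
      delete db;
      return 0;
    }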
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5282 Differential Revision: D15261011 Pulled By: miasantreble fbshipit-source-id: a15c94471e8c3b3b1f7f47c3135db1126e936949 --- db/db_impl_secondary.cc | 86 +++++++++++++++++++++++++---------------- db/db_impl_secondary.h | 2 + db/db_secondary_test.cc | 6 +++ 3 files changed, 60 insertions(+), 34 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 90e979b4e58..007910ea5b4 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -59,40 +59,7 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - // Recover from all newer log files than the ones named in the - // descriptor. - std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); - if (s.IsNotFound()) { - return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); - } else if (!s.ok()) { - return s; - } - - std::vector logs; - // if log_readers_ is non-empty, it means we have applied all logs with log - // numbers smaller than the smallest log in log_readers_, so there is no - // need to pass these logs to RecoverLogFiles - uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { - log_number_min = log_readers_.begin()->first; - } - for (size_t i = 0; i < filenames.size(); i++) { - uint64_t number; - FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && - number >= log_number_min) { - logs.push_back(number); - } - } - - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } + s = FindAndRecoverLogFiles(); } // TODO: update options_file_number_ needed? 
@@ -100,6 +67,41 @@ Status DBImplSecondary::Recover( return s; } +// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { + return s; + } + + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (log_readers_.size() > 0) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } + return s; +} + // try to find log reader using log_number from log_readers_ map, initialize // if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( @@ -294,6 +296,18 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, return s; } +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles() { + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); + } + return s; +} + Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { if (read_options.managed) { @@ -377,6 +391,7 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); Status s; + // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) @@ -389,6 +404,9 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { } sv_context.Clean(); } + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + s = FindAndRecoverLogFiles(); return s; } diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 64c81432848..32dbae058b8 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -194,6 +194,8 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; + Status FindAndRecoverLogFiles(); + Status FindNewLogNumbers(std::vector* logs); Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) override; diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 47daf9fd8cc..60ea5ba8d5f 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -237,6 +237,12 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { }; verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { From 25d81e4577c30f1da7fe6631f4123a5897de4f98 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 May 2019 
12:04:56 -0700 Subject: [PATCH 019/572] DBIter::Next() can skip user key checking if previous entry's seqnum is 0 (#5244) Summary: Right now, DBIter::Next() always checks whether an entry is for the same user key as the previous entry to see whether the key should be hidden from the user. However, if the previous entry's sequence number is 0, the check is not needed because 0 is the oldest possible sequence number. We could extend this from the seqnum 0 case to simply prev_seqno >= current_seqno. However, that is less robust against bugs or unexpected situations, while the gain is relatively low. We can always extend it later when needed. In a readseq benchmark with a fully formed LSM-tree, the number of key comparisons made is reduced from 2.981 to 2.165. In readseq against a fully compacted DB, no key comparisons are made at all. Performance in this benchmark didn't show an obvious improvement, which is expected because key comparisons only take a small percentage of CPU. But it may prove more effective if users have an expensive customized comparator. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5244 Differential Revision: D15067257 Pulled By: siying fbshipit-source-id: b7e1ef3ec4fa928cba509683d2b3246e35d270d9 --- HISTORY.md | 5 ++++- db/db_iter.cc | 29 +++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 65d64d23604..fb1db417ecf 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,9 +4,12 @@ * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. ### New Features -* Reduce binary search when iterator reseek into the same data block. * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. ### Performance Improvements +* Reduce binary search when iterator reseek into the same data block. +* DBIter::Next() can skip user key checking if previous entry's seqnum is 0. ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_iter.cc b/db/db_iter.cc index 43a56af78c7..1d8ccf9adbd 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -133,6 +133,7 @@ class DBIter final: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), + is_key_seqnum_zero_(false), prefix_same_as_start_(read_options.prefix_same_as_start), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), @@ -333,6 +334,10 @@ class DBIter final: public Iterator { Direction direction_; bool valid_; bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This implies that the next entry will be for another + // user key.
+ bool is_key_seqnum_zero_; const bool prefix_same_as_start_; // Means that we will pin all data blocks we read as long the Iterator // is not deleted, will be true if ReadOptions::pin_data is true @@ -381,6 +386,7 @@ void DBIter::Next() { num_internal_keys_skipped_ = 0; bool ok = true; if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; if (!ReverseToForward()) { ok = false; } @@ -400,6 +406,7 @@ void DBIter::Next() { FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_); } else { + is_key_seqnum_zero_ = false; valid_ = false; } if (statistics_ != nullptr && valid_) { @@ -450,10 +457,16 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_blob_ = false; do { + // Will update is_key_seqnum_zero_ as soon as we parse the current key, + // but we need to save the previous value to be used in the loop. + bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; return false; } + is_key_seqnum_zero_ = (ikey_.sequence == 0); + if (iterate_upper_bound_ != nullptr && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; @@ -470,11 +483,18 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) } if (IsVisible(ikey_.sequence)) { - if (skipping && user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) <= 0) { + // If the previous entry has seqnum 0, the current entry cannot + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because relaxing it + // would be more vulnerable to bugs that produce the same user key with + // the same sequence number. + if (!is_prev_key_seqnum_zero && skipping && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { + assert(!skipping || user_comparator_.Compare( + ikey_.user_key, saved_key_.GetUserKey()) > 0); num_skipped = 0; switch (ikey_.type) { case kTypeDeletion: @@ -595,6 +615,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. if (num_skipped > max_skip_ && CanReseekToSkip()) { + is_key_seqnum_zero_ = false; num_skipped = 0; std::string last_key; if (skipping) { @@ -1265,6 +1286,7 @@ void DBIter::Seek(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; SequenceNumber seq = sequence_; saved_key_.Clear(); @@ -1323,6 +1345,7 @@ void DBIter::SeekForPrev(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; saved_key_.Clear(); // now saved_key is used to store internal key.
saved_key_.SetInternalKey(target, 0 /* sequence_number */, @@ -1390,6 +1413,7 @@ void DBIter::SeekToFirst() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); @@ -1442,6 +1466,7 @@ void DBIter::SeekToLast() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); From 181bb43f08c77be7af72ceea12b9c66b8ab5fd7d Mon Sep 17 00:00:00 2001 From: anand76 Date: Thu, 9 May 2019 13:03:37 -0700 Subject: [PATCH 020/572] Fix bugs in FilePickerMultiGet (#5292) Summary: This PR fixes a couple of bugs in FilePickerMultiGet that were causing db_stress test failures. The failures were caused by:
1. Improper handling of a key that matches the user key portion of an L0 file's largest key. In this case, the curr_index_in_curr_level file index in L0 for that key was getting incremented, but batch_iter_ was not advanced. By design, all keys in a batch are supposed to be checked against an L0 file before advancing to the next L0 file. Not advancing to the next key in the batch was causing a double increment of curr_index_in_curr_level due to the same key being processed again.
2. Improper handling of a key that matches the user key portion of the largest key in the last file of L1 and higher. This was resulting in a premature end to the processing of the batch for that level when the next key in the batch is a duplicate. Typically, the keys in MultiGet will not be duplicates, but it's good to handle that case correctly.
Test - asan_crash make check Pull Request resolved: https://github.com/facebook/rocksdb/pull/5292 Differential Revision: D15282530 Pulled By: anand1976 fbshipit-source-id: d1a6a86e0af273169c3632db22a44d79c66a581f --- db/version_set.cc | 20 ++++++++++++++++++-- tools/db_crashtest.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 6d4fb7315ad..8463a5aa735 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -416,6 +416,18 @@ class FilePickerMultiGet { bool file_hit = false; int cmp_largest = -1; if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } return false; } // Loops over keys in the MultiGet batch until it finds a file with @@ -533,7 +545,10 @@ class FilePickerMultiGet { // any further for that key, so advance batch_iter_.
Else, keep // batch_iter_ positioned on that key so we look it up again in // the next file - if (current_level_range_.CheckKeyDone(batch_iter_)) { + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { ++batch_iter_; } } @@ -601,7 +616,8 @@ class FilePickerMultiGet { unsigned int start_index_in_curr_level; FilePickerContext(int32_t left, int32_t right) - : search_left_bound(left), search_right_bound(right) {} + : search_left_bound(left), search_right_bound(right), + curr_index_in_curr_level(0), start_index_in_curr_level(0) {} FilePickerContext() = default; }; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 780c987e929..6487562d8bb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,7 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), - "use_multiget" : 0, + "use_multiget" : lambda: random.randint(0, 1), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' From 9fad3e21eb90d215b6719097baba417bc1eeca3c Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 May 2019 14:15:12 -0700 Subject: [PATCH 021/572] Merging iterator to avoid child iterator reseek for some cases (#5286) Summary: When a reseek happens in the merging iterator, reseeking a child iterator can be avoided if: (1) the iterator represents immutable data, (2) the reseek() is to a larger key than the current key, and (3) the current key of the child iterator is larger than the seek key, because it is guaranteed that the result will fall into the same position. This optimization will be useful for use cases where users keep seeking to keys nearby in ascending order. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5286 Differential Revision: D15283635 Pulled By: siying fbshipit-source-id: 35f79ffd5ce3609146faa8cd55f2bfd733502f83 --- HISTORY.md | 1 + db/db_iterator_test.cc | 69 ++++++++++++++++++++++++++++++++ db/version_set.cc | 3 +- table/block_based_table_reader.h | 3 +- table/internal_iterator.h | 5 ++- table/iterator_wrapper.h | 7 +++- table/merging_iterator.cc | 19 ++++++++- 7 files changed, 101 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index fb1db417ecf..99235a33d5c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
+* Merging iterator to avoid child iterator reseek for some cases ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 78b387577dd..cc1af2e0ad8 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2548,6 +2548,75 @@ TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { SyncPoint::GetInstance()->DisableProcessing(); } +TEST_P(DBIteratorTest, AvoidReseekChildIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + + // These two keys will be kept in memtable. + ASSERT_OK(Put("0", random_str)); + ASSERT_OK(Put("8", random_str)); + + int num_iter_wrapper_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "IteratorWrapper::Seek:0", + [&](void* /*arg*/) { num_iter_wrapper_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + // DBIter always wraps internal iterator with IteratorWrapper, + // and in merging iterator each child iterator will be wrapped + // with IteratorWrapper. + ASSERT_EQ(4, num_iter_wrapper_seek); + + // child position: 1 and 5 + num_iter_wrapper_seek = 0; + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_iter_wrapper_seek); + + // child position: 2 and 5 + num_iter_wrapper_seek = 0; + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(4, num_iter_wrapper_seek); + + // child position: 8 and 6 + num_iter_wrapper_seek = 0; + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_iter_wrapper_seek); + + // child position: 8 and 7 + num_iter_wrapper_seek = 0; + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(4, num_iter_wrapper_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 8463a5aa735..84302556e66 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -896,7 +896,8 @@ class LevelIterator final : public InternalIterator { bool skip_filters, int level, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr) - : table_cache_(table_cache), + : InternalIterator(false), + table_cache_(table_cache), read_options_(read_options), env_options_(env_options), icomparator_(icomparator), diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 1fcc8cbfa07..74d2caeb28b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -590,7 +590,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool key_includes_seq = true, bool index_key_is_full = true, bool for_compaction = false) - : table_(table), + : InternalIteratorBase(false), + table_(table), read_options_(read_options), icomp_(icomp), user_comparator_(icomp.user_comparator()), diff --git 
a/table/internal_iterator.h b/table/internal_iterator.h index 6b713e7b951..8f1cc9dd68e 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -20,7 +20,8 @@ class PinnedIteratorsManager; template class InternalIteratorBase : public Cleanable { public: - InternalIteratorBase() {} + InternalIteratorBase() : is_mutable_(true) {} + InternalIteratorBase(bool _is_mutable) : is_mutable_(_is_mutable) {} virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or @@ -119,6 +120,7 @@ class InternalIteratorBase : public Cleanable { virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } + bool is_mutable() const { return is_mutable_; } protected: void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { @@ -130,6 +132,7 @@ class InternalIteratorBase : public Cleanable { Prev(); } } + bool is_mutable_; private: // No copying allowed diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index fc5eb2613d8..a570e53c1e2 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -69,7 +69,12 @@ class IteratorWrapperBase { assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void Seek(const Slice& k) { + TEST_SYNC_POINT("IteratorWrapper::Seek:0"); + assert(iter_); + iter_->Seek(k); + Update(); + } void SeekForPrev(const Slice& k) { assert(iter_); iter_->SeekForPrev(k); diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index bd4a186b3c2..e5df6bdf6f0 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -127,14 +127,29 @@ class MergingIterator : public InternalIterator { } void Seek(const Slice& target) override { + bool is_increasing_reseek = false; + if (current_ != nullptr && direction_ == kForward && status_.ok() && + comparator_->Compare(target, key()) >= 0) { + is_increasing_reseek = true; + } ClearHeaps(); status_ = Status::OK(); for (auto& child : children_) { - { + // If upper bound never changes, we can skip Seek() for + // the !Valid() case too, but people do hack the code to change + // upper bound between Seek(), so it's not a good idea to break + // the API. + // If DBIter is used on top of merging iterator, we probably + // can skip mutable child iterators if they are invalid too, + // but it's a less clean API. We can optimize for it later if + // needed. + if (!is_increasing_reseek || !child.Valid() || + comparator_->Compare(target, child.key()) > 0 || + child.iter()->is_mutable()) { PERF_TIMER_GUARD(seek_child_seek_time); child.Seek(target); + PERF_COUNTER_ADD(seek_child_seek_count, 1); } - PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { assert(child.status().ok()); From 6451673f379319755ff238ffef18c674ce37bd0b Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Thu, 9 May 2019 18:16:45 -0700 Subject: [PATCH 022/572] Add C bindings for LowerThreadPoolIO/CPUPriority (#5285) Summary: There were no C bindings for lowering thread pool priority. This adds those. 
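For illustration (not part of the patch), here is a minimal sketch of how the new functions slot into the existing C API env workflow; `rocksdb_create_default_env` and the thread-count setters are pre-existing C API calls, and the snippet compiles as C++:

    #include "rocksdb/c.h"

    int main() {
      rocksdb_env_t* env = rocksdb_create_default_env();
      // Size the pools first (existing C API calls).
      rocksdb_env_set_background_threads(env, 4);                // LOW pool
      rocksdb_env_set_high_priority_background_threads(env, 2);  // HIGH pool
      // New in this patch: lower the IO and CPU priority of both pools.
      rocksdb_env_lower_thread_pool_io_priority(env);
      rocksdb_env_lower_thread_pool_cpu_priority(env);
      rocksdb_env_lower_high_priority_thread_pool_io_priority(env);
      rocksdb_env_lower_high_priority_thread_pool_cpu_priority(env);
      // The env would then be attached to a rocksdb_options_t via
      // rocksdb_options_set_env() before opening a DB.
      rocksdb_env_destroy(env);
      return 0;
    }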
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5285 Differential Revision: D15290050 Pulled By: siying fbshipit-source-id: b2ed94d0c39d27434ace2204829a242b53d0d67a --- db/c.cc | 16 ++++++++++++++++ include/rocksdb/c.h | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/db/c.cc b/db/c.cc index aac1cf4087c..58b51e2523e 100644 --- a/db/c.cc +++ b/db/c.cc @@ -3268,6 +3268,22 @@ void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..ed0709d22a1 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1301,6 +1301,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); From f0bf3bf34b068a918b4969812553f21958f79ea6 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 10 May 2019 11:53:33 -0700 Subject: [PATCH 023/572] Turn CachableEntry into a proper resource handle (#5252) Summary: CachableEntry is used in a variety of contexts: it may refer to a cached object (i.e. an object in the block cache), an owned object, or an unowned object; also, in some cases (most notably with iterators), the responsibility of managing the pointed-to object gets handed off to another object. Each of the above scenarios has different implications for the lifecycle of the referenced object. For the most part, the patch does not change the lifecycle of managed objects; however, it makes these relationships explicit, and it also enables us to eliminate some hacks and accident-prone code around releasing cache handles and deleting/cleaning up objects. (The only places where the patch changes how objects are managed are the partitions of partitioned indexes and filters.)
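To make the scenarios concrete, here is an illustrative sketch (not from the patch; `MyBlock` is a hypothetical stand-in type) of scenarios #2 and #4 using the interface introduced below:

    #include <cassert>
    #include <utility>
    #include "rocksdb/cleanable.h"
    #include "table/cachable_entry.h"

    struct MyBlock {};  // stand-in for Block, FilterBlockReader, etc.

    void OwnershipSketch() {
      // Scenario #2: the entry uniquely owns a non-cached object.
      rocksdb::CachableEntry<MyBlock> entry(new MyBlock(), /* cache */ nullptr,
                                            /* cache_handle */ nullptr,
                                            /* own_value */ true);
      assert(entry.GetOwnValue());

      // The type is move-only; a move leaves the source empty.
      rocksdb::CachableEntry<MyBlock> moved(std::move(entry));
      assert(entry.IsEmpty());

      // Scenario #4: hand cleanup off to a Cleanable, as done for iterators.
      // The MyBlock is deleted when `c` runs its cleanup chain.
      rocksdb::Cleanable c;
      moved.TransferTo(&c);
      assert(moved.IsEmpty());
    }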
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5252 Differential Revision: D15101358 Pulled By: ltamasi fbshipit-source-id: 9eb59e9ae5a7230e3345789762d0ba1f189485be --- table/block_based_table_reader.cc | 244 +++++++++++-------------- table/block_based_table_reader.h | 29 +-- table/cachable_entry.h | 219 ++++++++++++++++++++++ table/partitioned_filter_block.cc | 66 ++----- table/partitioned_filter_block.h | 13 +- table/partitioned_filter_block_test.cc | 3 +- 6 files changed, 351 insertions(+), 223 deletions(-) create mode 100644 table/cachable_entry.h diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 514587d0b96..1dc220ddec5 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -112,12 +112,6 @@ inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( : nullptr; } -// Delete the resource that is held by the iterator. -template -void DeleteHeldResource(void* arg, void* /*ignored*/) { - delete reinterpret_cast(arg); -} - // Delete the entry resided in the cache. template void DeleteCachedEntry(const Slice& /*key*/, void* value) { @@ -224,7 +218,7 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } // namespace // Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader, public Cleanable { +class PartitionIndexReader : public IndexReader { public: // Read the partition index from the file and create an instance for // `PartitionIndexReader`. @@ -332,10 +326,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); - Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { handle = biter.value(); - BlockBasedTable::CachableEntry block; + CachableEntry block; const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks @@ -344,18 +337,12 @@ class PartitionIndexReader : public IndexReader, public Cleanable { UncompressionDict::GetEmptyDict(), &block, is_index, nullptr /* get_context */); - assert(s.ok() || block.value == nullptr); - if (s.ok() && block.value != nullptr) { - if (block.cache_handle != nullptr) { + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { if (pin) { - partition_map_[handle.offset()] = block; - RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - block_cache->Release(block.cache_handle); + partition_map_[handle.offset()] = std::move(block); } - } else { - delete block.value; } } } @@ -391,8 +378,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { } BlockBasedTable* table_; std::unique_ptr index_block_; - std::unordered_map> - partition_map_; + std::unordered_map> partition_map_; const bool index_key_includes_seq_; const bool index_value_is_full_; }; @@ -1221,14 +1207,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // This is the first call to NewIndexIterator() since we're in Open(). // On success it should give us ownership of the `CachableEntry` by // populating `index_entry`. 
- assert(index_entry.value != nullptr); + assert(index_entry.GetValue() != nullptr); if (prefetch_all) { - index_entry.value->CacheDependencies(pin_all); + index_entry.GetValue()->CacheDependencies(pin_all); } if (pin_index) { rep->index_entry = std::move(index_entry); - } else { - index_entry.Release(table_options.block_cache.get()); } } } @@ -1236,17 +1220,15 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = new_table->GetFilter(rep->table_prefix_extractor.get()); - if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies( + if (filter_entry.GetValue() != nullptr && prefetch_all) { + filter_entry.GetValue()->CacheDependencies( pin_all, rep->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the // cache while this reader is alive if (pin_filter) { - rep->filter_entry = filter_entry; - } else { - filter_entry.Release(table_options.block_cache.get()); + rep->filter_entry = std::move(filter_entry); } } } else { @@ -1369,10 +1351,13 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index, GetContext* get_context) { + + assert(block); + assert(block->IsEmpty()); + Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; @@ -1380,7 +1365,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Lookup uncompressed cache first if (block_cache != nullptr) { - block->cache_handle = GetEntryFromCache( + auto cache_handle = GetEntryFromCache( block_cache, block_cache_key, rep->level, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, @@ -1393,15 +1378,16 @@ Status BlockBasedTable::GetDataBlockFromCache( : &get_context->get_context_stats_.num_cache_data_hit) : nullptr, statistics, get_context); - if (block->cache_handle != nullptr) { - block->value = - reinterpret_cast(block_cache->Value(block->cache_handle)); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast(block_cache->Value(cache_handle)), + block_cache, cache_handle); return s; } } // If not found, search from the compressed block cache. 
- assert(block->cache_handle == nullptr && block->value == nullptr); + assert(block->IsEmpty()); if (block_cache_compressed == nullptr) { return s; @@ -1435,20 +1421,25 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - block->value = - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, - statistics); // uncompressed block - if (block_cache != nullptr && block->value->own_bytes() && + std::unique_ptr block_holder( + new Block(std::move(contents), rep->get_global_seqno(is_index), + read_amp_bytes_per_bit, statistics)); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { - size_t charge = block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, block->value, charge, + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, - &(block->cache_handle)); + &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; get_context->get_context_stats_.num_cache_bytes_write += charge; @@ -1477,9 +1468,9 @@ Status BlockBasedTable::GetDataBlockFromCache( } } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; } + } else { + block->SetOwnedValue(block_holder.release()); } } @@ -1497,33 +1488,34 @@ Status BlockBasedTable::PutDataBlockToCache( const UncompressionDict& uncompression_dict, SequenceNumber seq_no, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool is_index, Cache::Priority priority, GetContext* get_context) { + + assert(cached_block); + assert(cached_block->IsEmpty()); assert(raw_block_comp_type == kNoCompression || block_cache_compressed != nullptr); Status s; - // Retrieve the uncompressed contents into a new buffer - BlockContents uncompressed_block_contents; Statistics* statistics = ioptions.statistics; + + std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; UncompressionContext context(raw_block_comp_type); UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); s = UncompressBlockContents(info, raw_block_contents->data.data(), raw_block_contents->data.size(), &uncompressed_block_contents, format_version, ioptions, memory_allocator); - } - if (!s.ok()) { - return s; - } + if (!s.ok()) { + return s; + } - if (raw_block_comp_type != kNoCompression) { - cached_block->value = new Block(std::move(uncompressed_block_contents), - seq_no, read_amp_bytes_per_bit, - statistics); // uncompressed block + block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no, + read_amp_bytes_per_bit, statistics)); } else { - cached_block->value = - new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, ioptions.statistics); + block_holder.reset(new Block(std::move(*raw_block_contents), seq_no, + read_amp_bytes_per_bit, statistics)); } // Insert compressed block into compressed block cache. 
@@ -1553,16 +1545,20 @@ Status BlockBasedTable::PutDataBlockToCache( } // insert into uncompressed block cache - if (block_cache != nullptr && cached_block->value->own_bytes()) { - size_t charge = cached_block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, cached_block->value, charge, + if (block_cache != nullptr && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, - &(cached_block->cache_handle), priority); + &cache_handle, priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG if (s.ok()) { - assert(cached_block->cache_handle != nullptr); + assert(cache_handle != nullptr); + cached_block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; get_context->get_context_stats_.num_cache_bytes_write += charge; @@ -1589,12 +1585,12 @@ Status BlockBasedTable::PutDataBlockToCache( } } assert(reinterpret_cast(block_cache->Value( - cached_block->cache_handle)) == cached_block->value); + cached_block->GetCacheHandle())) == cached_block->GetValue()); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete cached_block->value; - cached_block->value = nullptr; } + } else { + cached_block->SetOwnedValue(block_holder.release()); } return s; @@ -1668,7 +1664,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( } } -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( +CachableEntry BlockBasedTable::GetFilter( const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; @@ -1677,7 +1673,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( no_io, get_context, prefix_extractor); } -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( +CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, const SliceTransform* prefix_extractor) const { @@ -1687,17 +1683,19 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // most probably fail again. 
if (!is_a_filter_partition && !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache handle */}; + return {rep_->filter.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } Cache* block_cache = rep_->table_options.block_cache.get(); if (rep_->filter_policy == nullptr /* do not use filter */ || block_cache == nullptr /* no block cache at all */) { - return {nullptr /* filter */, nullptr /* cache handle */}; + return CachableEntry(); } - if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { - return rep_->filter_entry; + if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { + return {rep_->filter_entry.GetValue(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } PERF_TIMER_GUARD(read_filter_block_nanos); @@ -1708,7 +1706,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( filter_blk_handle, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( + Cache::Handle* cache_handle = GetEntryFromCache( block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, get_context ? &get_context->get_context_stats_.num_cache_filter_miss @@ -1757,20 +1755,22 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( } } - return {filter, cache_handle}; + return {filter, cache_handle ? block_cache : nullptr, cache_handle, + false /* own_value */}; } -BlockBasedTable::CachableEntry +CachableEntry BlockBasedTable::GetUncompressionDict(Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) { if (!rep->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep->uncompression_dict.get(), nullptr /* cache handle */}; + return {rep->uncompression_dict.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } if (rep->compression_dict_handle.IsNull()) { - return {nullptr, nullptr}; + return CachableEntry(); } char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto cache_key = @@ -1835,7 +1835,8 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, assert(cache_handle == nullptr); } } - return {dict, cache_handle}; + return {dict, cache_handle ? rep->table_options.block_cache.get() : nullptr, + cache_handle, false /* own_value */}; } // disable_prefix_seek should be set to true when prefix_extractor found in SST @@ -1853,10 +1854,10 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( read_options.fill_cache); } // we have a pinned index block - if (rep_->index_entry.IsSet()) { + if (rep_->index_entry.IsCached()) { // We don't return pinned datat from index blocks, so no need // to set `block_contents_pinned`. 
- return rep_->index_entry.value->NewIterator( + return rep_->index_entry.GetValue()->NewIterator( input_iter, read_options.total_order_seek || disable_prefix_seek, read_options.fill_cache); } @@ -1948,7 +1949,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( // the caller would like to take ownership of the index block // don't call RegisterCleanup() in this case, the caller will take care of it if (index_entry != nullptr) { - *index_entry = {index_reader, cache_handle}; + *index_entry = {index_reader, block_cache, cache_handle, + false /* own_value */}; } else { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); } @@ -1976,9 +1978,9 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( auto uncompression_dict_storage = GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); const UncompressionDict& uncompression_dict = - uncompression_dict_storage.value == nullptr + uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.value; + : *uncompression_dict_storage.GetValue(); if (s.ok()) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, uncompression_dict, &block, is_index, @@ -1991,7 +1993,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( iter = new TBlockIter; } // Didn't get any data from block caches. - if (s.ok() && block.value == nullptr) { + if (s.ok() && block.GetValue() == nullptr) { if (no_io) { // Could not read from block_cache and can't do IO iter->Invalidate(Status::Incomplete("no blocking io")); @@ -2012,16 +2014,15 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( GetMemoryAllocator(rep->table_options)); } if (s.ok()) { - block.value = block_value.release(); + block.SetOwnedValue(block_value.release()); } } // TODO(ajkr): also pin compression dictionary block when // `pin_l0_filter_and_index_blocks_in_cache == true`. - uncompression_dict_storage.Release(block_cache); } if (s.ok()) { - assert(block.value != nullptr); + assert(block.GetValue() != nullptr); const bool kTotalOrderSeek = true; // Block contents are pinned and it is still pinned after the iterator // is destroyed as long as cleanup functions are moved to another object, @@ -2031,16 +2032,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // not reading data from the original source, whether immortal or not. // Otherwise, the block is pinned iff the source is immortal. 
bool block_contents_pinned = - (block.cache_handle != nullptr || - (!block.value->own_bytes() && rep->immortal_table)); - iter = block.value->NewIterator( + (block.IsCached() || + (!block.GetValue()->own_bytes() && rep->immortal_table)); + iter = block.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, index_key_is_full, block_contents_pinned); - if (block.cache_handle != nullptr) { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { + if (!block.IsCached()) { if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { // insert a dummy record to block cache to track the memory usage Cache::Handle* cache_handle; @@ -2063,8 +2061,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Slice unique_key = Slice(cache_key, static_cast(end - cache_key)); s = block_cache->Insert(unique_key, nullptr, - block.value->ApproximateMemoryUsage(), nullptr, - &cache_handle); + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); if (s.ok()) { if (cache_handle != nullptr) { iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, @@ -2072,10 +2070,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } } } - iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); } + + block.TransferTo(iter); } else { - assert(block.value == nullptr); + assert(block.GetValue() == nullptr); iter->Invalidate(s); } return iter; @@ -2122,7 +2121,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Can't find the block from the cache. If I/O is allowed, read from the // file. - if (block_entry->value == nullptr && !no_io && ro.fill_cache) { + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { Statistics* statistics = rep->ioptions.statistics; bool do_decompress = block_cache_compressed == nullptr && rep->blocks_maybe_compressed; @@ -2159,7 +2158,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } } } - assert(s.ok() || block_entry->value == nullptr); + assert(s.ok() || block_entry->GetValue() == nullptr); return s; } @@ -2187,11 +2186,11 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( Cache* block_cache = rep->table_options.block_cache.get(); assert(block_cache); RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.cache_handle)); + block_cache->GetUsage(block->second.GetCacheHandle())); Statistics* kNullStats = nullptr; // We don't return pinned datat from index blocks, so no need // to set `block_contents_pinned`. 
- return block->second.value->NewIterator( + return block->second.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } @@ -2239,7 +2238,7 @@ bool BlockBasedTable::PrefixMayMatch( // First, try check with full filter auto filter_entry = GetFilter(prefix_extractor); - FilterBlockReader* filter = filter_entry.value; + FilterBlockReader* filter = filter_entry.GetValue(); bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { @@ -2251,9 +2250,6 @@ bool BlockBasedTable::PrefixMayMatch( } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return true; } auto prefix = prefix_extractor->Transform(user_key); @@ -2317,12 +2313,6 @@ bool BlockBasedTable::PrefixMayMatch( } } - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return may_match; } @@ -2734,7 +2724,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, read_options.read_tier == kBlockCacheTier, get_context); } - filter = filter_entry.value; + filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block @@ -2838,12 +2828,6 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return s; } @@ -2864,7 +2848,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, read_options.read_tier == kBlockCacheTier, nullptr /*get_context*/); } - filter = filter_entry.value; + filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block @@ -2954,13 +2938,6 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, *(miter->s) = s; } } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } } Status BlockBasedTable::Prefetch(const Slice* const begin, @@ -3144,11 +3121,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); } assert(s.ok()); - bool in_cache = block.value != nullptr; - if (in_cache) { - ReleaseCachedEntry(block_cache, block.cache_handle); - } - return in_cache; + return block.IsCached(); } BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { @@ -3494,9 +3467,6 @@ void BlockBasedTable::Close() { Cache* const cache = rep_->table_options.block_cache.get(); - rep_->filter_entry.Release(cache); - rep_->index_entry.Release(cache); - // cleanup index, 
filter, and compression dictionary blocks
   // to avoid accessing dangling pointers
   if (!rep_->table_options.no_block_cache) {
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 74d2caeb28b..385e50ab79f 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -25,6 +25,7 @@
 #include "rocksdb/table.h"
 #include "table/block.h"
 #include "table/block_based_table_factory.h"
+#include "table/cachable_entry.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "table/get_context.h"
@@ -220,8 +221,6 @@ class BlockBasedTable : public TableReader {
   // The key retrieved are internal keys.
   Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
 
-  template <class TValue>
-  struct CachableEntry;
   struct Rep;
 
   Rep* get_rep() { return rep_; }
@@ -311,8 +310,7 @@ class BlockBasedTable : public TableReader {
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
       Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
       const ReadOptions& read_options,
-      BlockBasedTable::CachableEntry<Block>* block,
-      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block, const UncompressionDict& uncompression_dict,
       size_t read_amp_bytes_per_bit, bool is_index = false,
       GetContext* get_context = nullptr);
 
@@ -446,29 +444,6 @@ class BlockBasedTable::PartitionedIndexIteratorState
   bool index_key_is_full_;
 };
 
-// CachableEntry represents the entries that *may* be fetched from block cache.
-// field `value` is the item we want to get.
-// field `cache_handle` is the cache handle to the block cache. If the value
-// was not read from cache, `cache_handle` will be nullptr.
-template <class TValue>
-struct BlockBasedTable::CachableEntry {
-  CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
-      : value(_value), cache_handle(_cache_handle) {}
-  CachableEntry() : CachableEntry(nullptr, nullptr) {}
-  void Release(Cache* cache, bool force_erase = false) {
-    if (cache_handle) {
-      cache->Release(cache_handle, force_erase);
-      value = nullptr;
-      cache_handle = nullptr;
-    }
-  }
-  bool IsSet() const { return cache_handle != nullptr; }
-
-  TValue* value = nullptr;
-  // if the entry is from the cache, cache_handle will be populated.
-  Cache::Handle* cache_handle = nullptr;
-};
-
 struct BlockBasedTable::Rep {
   Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
       const BlockBasedTableOptions& _table_opt,
diff --git a/table/cachable_entry.h b/table/cachable_entry.h
new file mode 100644
index 00000000000..5b5d16ef318
--- /dev/null
+++ b/table/cachable_entry.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include "rocksdb/cache.h"
+#include "rocksdb/cleanable.h"
+
+namespace rocksdb {
+
+// CachableEntry is a handle to an object that may or may not be in the block
+// cache. It is used in a variety of ways:
+//
+// 1) It may refer to an object in the block cache. In this case, cache_ and
+// cache_handle_ are not nullptr, and the cache handle has to be released when
+// the CachableEntry is destroyed (the lifecycle of the cached object, on the
+// other hand, is managed by the cache itself).
+// 2) It may uniquely own the (non-cached) object it refers to (examples include
+// a block read directly from file, or uncompressed blocks when there is a
+// compressed block cache but no uncompressed block cache). In such cases, the
+// object has to be destroyed when the CachableEntry is destroyed.
+// 3) It may point to an object (cached or not) without owning it. In this case,
+// no action is needed when the CachableEntry is destroyed.
+// 4) Sometimes, management of a cached or owned object (see #1 and #2 above)
+// is transferred to some other object. This is used for instance with iterators
+// (where cleanup is performed using a chain of cleanup functions,
+// see Cleanable).
+//
+// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not
+// allowed); hence, this is a move-only type, where a move transfers the
+// management responsibilities, and leaves the source object in an empty state.
+
+template <class T>
+class CachableEntry {
+public:
+  CachableEntry() = default;
+
+  CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle,
+    bool own_value)
+    : value_(value)
+    , cache_(cache)
+    , cache_handle_(cache_handle)
+    , own_value_(own_value)
+  {
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+  }
+
+  CachableEntry(const CachableEntry&) = delete;
+  CachableEntry& operator=(const CachableEntry&) = delete;
+
+  CachableEntry(CachableEntry&& rhs)
+    : value_(rhs.value_)
+    , cache_(rhs.cache_)
+    , cache_handle_(rhs.cache_handle_)
+    , own_value_(rhs.own_value_)
+  {
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+  }
+
+  CachableEntry& operator=(CachableEntry&& rhs) {
+    if (UNLIKELY(this == &rhs)) {
+      return *this;
+    }
+
+    ReleaseResource();
+
+    value_ = rhs.value_;
+    cache_ = rhs.cache_;
+    cache_handle_ = rhs.cache_handle_;
+    own_value_ = rhs.own_value_;
+
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CachableEntry() {
+    ReleaseResource();
+  }
+
+  bool IsEmpty() const {
+    return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr &&
+      !own_value_;
+  }
+
+  bool IsCached() const {
+    assert(!!cache_ == !!cache_handle_);
+
+    return cache_handle_ != nullptr;
+  }
+
+  T* GetValue() const { return value_; }
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return cache_handle_; }
+  bool GetOwnValue() const { return own_value_; }
+
+  void Reset() {
+    ReleaseResource();
+    ResetFields();
+  }
+
+  void TransferTo(Cleanable* cleanable) {
+    if (cleanable) {
+      if (cache_handle_ != nullptr) {
+        assert(cache_ != nullptr);
+        cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_);
+      } else if (own_value_) {
+        cleanable->RegisterCleanup(&DeleteValue, value_, nullptr);
+      }
+    }
+
+    ResetFields();
+  }
+
+  void SetOwnedValue(T* value) {
+    assert(value != nullptr);
+
+    if (UNLIKELY(value_ == value && own_value_)) {
+      assert(cache_ == nullptr && cache_handle_ == nullptr);
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    own_value_ = true;
+  }
+
+  void SetUnownedValue(T* value) {
+    assert(value != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == nullptr &&
+      cache_handle_ == nullptr && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    assert(!own_value_);
+  }
+
+  void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) {
+    assert(value != nullptr);
+    assert(cache != nullptr);
+    assert(cache_handle != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == cache &&
+      cache_handle_ == cache_handle && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    cache_ = cache;
+    cache_handle_ = cache_handle;
+    assert(!own_value_);
+  }
+
+private:
+  void ReleaseResource() {
+    if (LIKELY(cache_handle_ != nullptr)) {
+      assert(cache_ != nullptr);
+      cache_->Release(cache_handle_);
+    } else if (own_value_) {
+      delete value_;
+    }
+  }
+
+  void ResetFields() {
+    value_ = nullptr;
+    cache_ = nullptr;
+    cache_handle_ = nullptr;
+    own_value_ = false;
+  }
+
+  static void ReleaseCacheHandle(void* arg1, void* arg2) {
+    Cache* const cache = static_cast<Cache*>(arg1);
+    assert(cache);
+
+    Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+    assert(cache_handle);
+
+    cache->Release(cache_handle);
+  }
+
+  static void DeleteValue(void* arg1, void* /* arg2 */) {
+    delete static_cast<T*>(arg1);
+  }
+
+private:
+  T* value_ = nullptr;
+  Cache* cache_ = nullptr;
+  Cache::Handle* cache_handle_ = nullptr;
+  bool own_value_ = false;
+};
+
+} // namespace rocksdb
diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc
index aab0f5509b9..3ccc7946393 100644
--- a/table/partitioned_filter_block.cc
+++ b/table/partitioned_filter_block.cc
@@ -176,24 +176,14 @@ bool PartitionedFilterBlockReader::KeyMayMatch(
   if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
     return false;
   }
-  bool cached = false;
   auto filter_partition =
       GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
-                         &cached, prefix_extractor);
-  if (UNLIKELY(!filter_partition.value)) {
+                         prefix_extractor);
+  if (UNLIKELY(!filter_partition.GetValue())) {
    return true;
   }
-  auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor,
-                                                 block_offset, no_io);
-  if (cached) {
-    return res;
-  }
-  if (LIKELY(filter_partition.IsSet())) {
-    filter_partition.Release(table_->rep_->table_options.block_cache.get());
-  } else {
-    delete filter_partition.value;
-  }
-  return res;
+  return filter_partition.GetValue()->KeyMayMatch(key, prefix_extractor,
+                                                  block_offset, no_io);
 }
 
 bool PartitionedFilterBlockReader::PrefixMayMatch(
@@ -215,24 +205,14 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
   if (UNLIKELY(filter_handle.size() == 0)) {  // prefix is out of range
     return false;
   }
-  bool cached = false;
   auto filter_partition =
       GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
-                         &cached, prefix_extractor);
-  if (UNLIKELY(!filter_partition.value)) {
+                         prefix_extractor);
+  if (UNLIKELY(!filter_partition.GetValue())) {
    return true;
   }
-  auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor,
-                                                    kNotValid, no_io);
-  if (cached) {
-    return res;
-  }
-  if (LIKELY(filter_partition.IsSet())) {
-    filter_partition.Release(table_->rep_->table_options.block_cache.get());
-  } else {
-    delete filter_partition.value;
-  }
-  return res;
+  return filter_partition.GetValue()->PrefixMayMatch(prefix, prefix_extractor,
+                                                     kNotValid, no_io);
 }
 
 BlockHandle
PartitionedFilterBlockReader::GetFilterPartitionHandle( @@ -251,10 +231,10 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } -BlockBasedTable::CachableEntry +CachableEntry PartitionedFilterBlockReader::GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { + const bool no_io, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -267,9 +247,9 @@ PartitionedFilterBlockReader::GetFilterPartition( RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); RecordTick(statistics(), BLOCK_CACHE_HIT); RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(iter->second.cache_handle)); - *cached = true; - return iter->second; + block_cache->GetUsage(iter->second.GetCacheHandle())); + return {iter->second.GetValue(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } } return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, @@ -278,7 +258,8 @@ PartitionedFilterBlockReader::GetFilterPartition( } else { auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, is_a_filter_partition, prefix_extractor); - return {filter, nullptr}; + return {filter, nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */}; } } @@ -293,18 +274,10 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { // TODO(myabandeh): better estimation for filter_map_ size } -// Release the cached entry and decrement its ref count. -void ReleaseFilterCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - // TODO(myabandeh): merge this with the same function in IndexReader void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( @@ -330,7 +303,6 @@ void PartitionedFilterBlockReader::CacheDependencies( // After prefetch, read the partitions one by one biter.SeekToFirst(); - Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { handle = biter.value(); const bool no_io = true; @@ -338,16 +310,10 @@ void PartitionedFilterBlockReader::CacheDependencies( auto filter = table_->GetFilter( prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, /* get_context */ nullptr, prefix_extractor); - if (LIKELY(filter.IsSet())) { + if (LIKELY(filter.IsCached())) { if (pin) { filter_map_[handle.offset()] = std::move(filter); - RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, - filter.cache_handle); - } else { - block_cache->Release(filter.cache_handle); } - } else { - delete filter.value; } } } diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 5d55da54493..2563dd2bf35 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -15,6 +15,7 @@ #include "table/block.h" #include "table/block_based_table_reader.h" +#include "table/cachable_entry.h" #include "table/full_filter_block.h" #include "table/index_builder.h" #include "util/autovector.h" @@ -69,8 +70,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { BlockHandle 
last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader, - public Cleanable { +class PartitionedFilterBlockReader : public FilterBlockReader { public: explicit PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool whole_key_filtering, @@ -93,10 +93,9 @@ class PartitionedFilterBlockReader : public FilterBlockReader, private: BlockHandle GetFilterPartitionHandle(const Slice& entry); - BlockBasedTable::CachableEntry GetFilterPartition( + CachableEntry GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, bool* cached, - const SliceTransform* prefix_extractor = nullptr); + const bool no_io, const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -106,9 +105,7 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const BlockBasedTable* table_; const bool index_key_includes_seq_; const bool index_value_is_full_; - std::unordered_map> - filter_map_; + std::unordered_map> filter_map_; }; } // namespace rocksdb diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 8068f14d815..8afa530d71a 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -35,7 +35,8 @@ class MockedBlockBasedTable : public BlockBasedTable { auto obj = new FullFilterBlockReader( prefix_extractor, true, BlockContents(slice), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr}; + return {obj, nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */}; } FilterBlockReader* ReadFilter( From 6a6aef25c1f20f5922e1478999fe0e7f59af1712 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Fri, 10 May 2019 12:36:40 -0700 Subject: [PATCH 024/572] Fix crash in BlockBasedTableIterator::Seek() (#5291) Summary: https://github.com/facebook/rocksdb/pull/5256 broke it: `block_iter_.user_key()` may not be valid even if `block_iter_points_to_real_block_` is true. E.g. if there was an IO error or Status::Incomplete. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5291 Differential Revision: D15273324 Pulled By: al13n321 fbshipit-source-id: 442e5b09f9884a58f92a6ac1ca93af719c219886 --- table/block_based_table_reader.cc | 2 +- table/table_test.cc | 38 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1dc220ddec5..576117f0d35 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2325,7 +2325,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { } bool need_seek_index = true; - if (block_iter_points_to_real_block_) { + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. 
    prev_index_value_ = index_iter_->value();

    // We can avoid an index seek if:
diff --git a/table/table_test.cc b/table/table_test.cc
index a62ce4255e3..7292ad7c32d 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1798,6 +1798,44 @@ TEST_P(BlockBasedTableTest, PartitionIndexTest) {
   }
 }
 
+TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  Options options;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+
+  TableConstructor c(BytewiseComparator());
+  AddInternalKey(&c, "pika");
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  ASSERT_EQ(1, keys.size());
+
+  auto reader = c.GetTableReader();
+  ReadOptions ropt;
+  ropt.read_tier = ReadTier::kBlockCacheTier;
+  std::unique_ptr<InternalIterator> iter(
+      reader->NewIterator(ropt, /* prefix_extractor */ nullptr));
+
+  auto ikey = [](Slice user_key) {
+    return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+  };
+
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // This used to crash at some point.
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+}
+
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.

From e62601654535cdf9af46c99455af8da969efde65 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Fri, 10 May 2019 17:53:41 -0700
Subject: [PATCH 025/572] Fix a race condition caused by unlocking db mutex (#5294)

Summary:
Previous code may call `~ColumnFamilyData` in `DBImpl::AtomicFlushMemTablesToOutputFiles` if the column family is dropped or `cfd->IsFlushPending() == false`. In `~ColumnFamilyData`, the db mutex is released briefly and re-acquired. This can cause a correctness issue. The reason is as follows.
Assume there are multiple bg flush threads. After bg_flush_thr1 releases the db mutex, bg_flush_thr2 can grab it and pop an element from the flush queue. This will cause bg_flush_thr2 to accidentally pick some memtables which should have been picked by bg_flush_thr1. To make matters worse, bg_flush_thr2 can clear the `flush_requested_` flag for the memtable list, causing a subsequent call to `MemTableList::IsFlushPending()` by bg_flush_thr1 to return false, which is wrong.
The fix is to delay `ColumnFamilyData::Unref` and `~ColumnFamilyData` for column families not selected for flush until `AtomicFlushMemTablesToOutputFiles` returns. Furthermore, a bg flush thread should not clear `MemTableList::flush_requested_` in `MemTableList::PickMemtablesToFlush` unless atomic flush is not used **or** the memtable list does not have unpicked memtables.
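For illustration, the fix follows the usual deferred-teardown pattern, sketched below with simplified stand-in types (`Entry`, `DrainQueue`), not the actual RocksDB classes: any cleanup that might release and re-acquire the mutex is postponed until the scan of the shared queue is complete.

```
#include <mutex>
#include <vector>

// Simplified stand-in: a refcounted entry whose teardown must not run
// while the shared queue is being traversed under the mutex.
struct Entry {
  int refs = 1;
  bool Unref() { return --refs == 0; }
};

void DrainQueue(std::mutex& mu, std::vector<Entry*>& queue) {
  std::vector<Entry*> deferred;
  {
    std::lock_guard<std::mutex> lock(mu);
    for (Entry* e : queue) {
      // Deleting e here would be unsafe if its destructor could release
      // and re-acquire mu (as ~ColumnFamilyData does): another thread
      // could pop from the queue in that window. So only collect it.
      deferred.push_back(e);
    }
    queue.clear();
  }
  // Safe point: no shared state is being traversed any more.
  for (Entry* e : deferred) {
    if (e->Unref()) {
      delete e;
    }
  }
}
```

The deferred list costs one extra vector, but it keeps the critical section free of re-entrant locking.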
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5294 Differential Revision: D15295297 Pulled By: riversand963 fbshipit-source-id: 03b101205ca22c242647cbf488bcf0ed80b2ecbd --- HISTORY.md | 3 +++ db/db_flush_test.cc | 31 +++++++++++++++++++++++++++++++ db/db_impl_compaction_flush.cc | 10 +++++++--- db/memtable_list.cc | 8 +++++++- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 99235a33d5c..23d8717f361 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,9 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +### Bug Fixes +* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 09c461f8da4..c603f60b460 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -514,6 +514,37 @@ TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) { ASSERT_EQ("value", Get(0, "key")); } +TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { + bool atomic_flush = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.max_write_buffer_number = 4; + // Set min_write_buffer_number_to_merge to be greater than 1, so that + // a column family with one memtable in the imm will not cause IsFlushPending + // to return true when flush_requested_ is false. + options.min_write_buffer_number_to_merge = 2; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + ASSERT_OK(Put(0, "key01", "value01")); + // Since max_write_buffer_number is 4, the following flush won't cause write + // stall. 
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_[1] = nullptr;
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  delete handles_[0];
+  handles_.clear();
+}
+
 INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
                         testing::Bool());
diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc
index 1cdadf03942..3fbf24e49f8 100644
--- a/db/db_impl_compaction_flush.cc
+++ b/db/db_impl_compaction_flush.cc
@@ -2082,6 +2082,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
   autovector<BGFlushArg> bg_flush_args;
   std::vector<SuperVersionContext>& superversion_contexts =
       job_context->superversion_contexts;
+  autovector<ColumnFamilyData*> column_families_not_to_flush;
   while (!flush_queue_.empty()) {
     // This cfd is already referenced
     const FlushRequest& flush_req = PopFirstFromFlushQueue();
@@ -2092,9 +2093,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
       ColumnFamilyData* cfd = iter.first;
       if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
         // can't flush this CF, try next one
-        if (cfd->Unref()) {
-          delete cfd;
-        }
+        column_families_not_to_flush.push_back(cfd);
         continue;
       }
       superversion_contexts.emplace_back(SuperVersionContext(true));
@@ -2133,6 +2132,11 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
       }
     }
   }
+  for (auto cfd : column_families_not_to_flush) {
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
   return status;
 }
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 5abe59b3632..69beb77f965 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -277,8 +277,12 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
   const auto& memlist = current_->memlist_;
+  bool atomic_flush = false;
   for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
     MemTable* m = *it;
+    if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+      atomic_flush = true;
+    }
     if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) {
       break;
     }
@@ -292,7 +296,9 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
       ret->push_back(m);
     }
   }
-  flush_requested_ = false;  // start-flush request is complete
+  if (!atomic_flush || num_flush_not_started_ == 0) {
+    flush_requested_ = false;  // start-flush request is complete
+  }
 }
 
 void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,

From 92c60547fe1bc3254a18c2ff82e5398339cdb45b Mon Sep 17 00:00:00 2001
From: Yi Wu
Date: Mon, 13 May 2019 11:26:34 -0700
Subject: [PATCH 026/572] db_bench: fix hang on IO error (#5300)

Summary:
db_bench will wait indefinitely if there's a background error. Fix by passing `abs_time_us` to the cond var's timed wait.
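The idea, sketched with the standard library rather than RocksDB's InstrumentedCondVar (`recovery_complete` and the deadline parameter are illustrative):

```
#include <chrono>
#include <condition_variable>
#include <mutex>

std::mutex mu;
std::condition_variable cv;
bool recovery_complete = false;

// A timed wait bounds how long the caller can block: if the recovery
// signal never arrives (e.g. the background error is unrecoverable),
// the deadline fires instead of hanging forever like a plain wait().
bool WaitForRecovery(std::chrono::steady_clock::time_point deadline) {
  std::unique_lock<std::mutex> lock(mu);
  cv.wait_until(lock, deadline, [] { return recovery_complete; });
  if (recovery_complete) {
    recovery_complete = false;  // consume the signal, as db_bench does
    return true;
  }
  return false;
}
```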
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5300

Differential Revision: D15319945

Pulled By: miasantreble

fbshipit-source-id: 0034fb7f6ec7c3303c4ccf26e54c20fbdac8ab44
---
 tools/db_bench_tool.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index f0f1d879b96..b2562f4e539 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -2102,10 +2102,10 @@ class Benchmark {
     cv_.SignalAll();
   }
 
-  bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+  bool WaitForRecovery(uint64_t abs_time_us) {
     InstrumentedMutexLock l(&mutex_);
     if (!recovery_complete_) {
-      cv_.Wait(/*abs_time_us*/);
+      cv_.TimedWait(abs_time_us);
     }
     if (recovery_complete_) {
       recovery_complete_ = false;

From f383641a1d772bcde6dc42f26d798c0d93311443 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 13 May 2019 17:43:47 -0700
Subject: [PATCH 027/572] Unordered Writes (#5218)

Summary:
Perform unordered writes in rocksdb when the unordered_write option is set to true. When enabled, writes to the memtable are done without joining any write thread. This offers much higher write throughput since the upcoming writes do not have to wait for the slowest memtable write to finish. The tradeoff is that the writes visible to a snapshot might change over time. If the application cannot tolerate that, it should implement its own mechanisms to work around it. Using TransactionDB with the WRITE_PREPARED write policy is one way to achieve that. Doing so increases the max throughput by 2.2x without however compromising the snapshot guarantees.

The patch is prepared based on an original by siying. Existing unit tests are extended to include the unordered_write option.

Benchmark Results:
```
TEST_TMPDIR=/dev/shm/ ./db_bench_unordered --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --unordered_write=1
```
With WAL
- Vanilla RocksDB: 78.6 MB/s
- WRITE_PREPARED with unordered_write: 177.8 MB/s (2.2x)
- unordered_write: 368.9 MB/s (4.7x with relaxed snapshot guarantees)

Without WAL
- Vanilla RocksDB: 111.3 MB/s
- WRITE_PREPARED with unordered_write: 259.3 MB/s (2.3x)
- unordered_write: 645.6 MB/s (5.8x with relaxed snapshot guarantees)
- WRITE_PREPARED with unordered_write and concurrency control disabled: 185.3 MB/s (2.35x)

Limitations:
- The feature is not yet extended to `max_successive_merges` > 0. The feature is also incompatible with `enable_pipelined_write` = true as well as with `allow_concurrent_memtable_write` = false.
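As a rough sketch of how an application could opt in while keeping snapshot guarantees (hypothetical usage built on the options this patch introduces, not code taken from the patch itself):

```
#include <string>
#include "rocksdb/utilities/transaction_db.h"

// Sketch: pair unordered_write with a WRITE_PREPARED TransactionDB so
// snapshots stay immutable despite out-of-order memtable writes.
rocksdb::Status OpenUnorderedTxnDB(const std::string& path,
                                   rocksdb::TransactionDB** txn_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;  // the new option from this patch

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  return rocksdb::TransactionDB::Open(options, txn_db_options, path, txn_db);
}
```

A plain DB::Open with unordered_write=true gets the full throughput benefit, but snapshots then only offer Read-Your-Own-Write semantics rather than a consistent point-in-time view.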
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5218 Differential Revision: D15219029 Pulled By: maysamyabandeh fbshipit-source-id: 38f2abc4af8780148c6128acdba2b3227bc81759 --- HISTORY.md | 1 + db/c.cc | 5 + db/db_bloom_filter_test.cc | 2 + db/db_impl.h | 58 ++++- db/db_impl_open.cc | 11 + db/db_impl_write.cc | 157 ++++++++++-- db/db_memtable_test.cc | 69 +++++ db/db_test_util.cc | 8 + db/db_test_util.h | 6 + db/flush_scheduler.h | 3 + db/plain_table_db_test.cc | 1 + db/write_batch.cc | 9 +- db/write_callback_test.cc | 17 +- include/rocksdb/c.h | 2 + include/rocksdb/options.h | 25 ++ options/db_options.cc | 3 + options/db_options.h | 1 + options/options_helper.cc | 4 + options/options_settable_test.cc | 1 + table/block_based_table_factory.cc | 8 +- tools/db_bench_tool.cc | 4 + .../pessimistic_transaction_db.cc | 16 +- utilities/transactions/transaction_test.cc | 61 +++-- utilities/transactions/transaction_test.h | 27 +- .../write_prepared_transaction_test.cc | 237 ++++++++++++------ .../write_unprepared_transaction_test.cc | 3 +- 26 files changed, 585 insertions(+), 154 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 23d8717f361..919dea21133 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. +* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/db/c.cc b/db/c.cc index 58b51e2523e..8f96366fbed 100644 --- a/db/c.cc +++ b/db/c.cc @@ -2473,6 +2473,11 @@ void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, opt->rep.enable_pipelined_write = v; } +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index a2a01d6b4cf..beed590ae66 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1095,6 +1095,8 @@ TEST_F(DBBloomFilterTest, PrefixScan) { options.max_background_compactions = 2; options.create_if_missing = true; options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false options.allow_concurrent_memtable_write = false; BlockBasedTableOptions table_options; diff --git a/db/db_impl.h b/db/db_impl.h index 623f69ba6ef..0ee5d82b56c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -897,14 +897,32 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr); - // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates - // the number of sub-patches. A sub-patch is a subset of the write batch that - // does not have duplicate keys. 
-  Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates,
-                          WriteCallback* callback = nullptr,
-                          uint64_t* log_used = nullptr, uint64_t log_ref = 0,
-                          uint64_t* seq_used = nullptr, size_t batch_cnt = 0,
-                          PreReleaseCallback* pre_release_callback = nullptr);
+  // Write only to memtables without joining any write queue
+  Status UnorderedWriteMemtable(const WriteOptions& write_options,
+                                WriteBatch* my_batch, WriteCallback* callback,
+                                uint64_t log_ref, SequenceNumber seq,
+                                const size_t sub_batch_cnt);
+
+  // Whether the batch requires an order to be assigned
+  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+  // Whether it requires publishing the last sequence or not
+  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+  // Join the write_thread to write the batch only to the WAL. It is the
+  // responsibility of the caller to also write the write batch to the memtable
+  // if required.
+  //
+  // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder,
+  // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+  // of the write batch that does not have duplicate keys. When seq_per_batch is
+  // not set, each key is a separate sub-batch. Otherwise each duplicate key
+  // marks the start of a new sub-batch.
+  Status WriteImplWALOnly(
+      WriteThread* write_thread, const WriteOptions& options,
+      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+      PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+      const PublishLastSeq publish_last_seq, const bool disable_memtable);
 
   // write cached_recoverable_state_ to memtable if it is not empty
   // The writer must be the leader in write_thread_ and holding mutex_
@@ -1121,6 +1139,20 @@ class DBImpl : public DB {
       const autovector<const uint64_t*>& flush_memtable_ids,
       bool resuming_from_bg_err);
 
+  inline void WaitForPendingWrites() {
+    if (!immutable_db_options_.unordered_write) {
+      // Then the writes are finished before the next write group starts
+      return;
+    }
+    // Wait for the ones who already wrote to the WAL to finish their
+    // memtable write.
+    if (pending_memtable_writes_.load() != 0) {
+      std::unique_lock<std::mutex> guard(switch_mutex_);
+      switch_cv_.wait(guard,
+                      [&] { return pending_memtable_writes_.load() == 0; });
+    }
+  }
+
   // REQUIRES: mutex locked and in write thread.
   void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
 
@@ -1571,13 +1603,21 @@ class DBImpl : public DB {
   // corresponding call to PurgeObsoleteFiles has not yet finished.
   int pending_purge_obsolete_files_;
 
-  // last time when DeleteObsoleteFiles with full scan was executed. Originaly
+  // last time when DeleteObsoleteFiles with full scan was executed. Originally
   // initialized with startup time.
   uint64_t delete_obsolete_files_last_run_;
 
   // last time stats were dumped to LOG
   std::atomic<uint64_t> last_stats_dump_time_microsec_;
 
+  // The thread that wants to switch memtable can wait on this cv until the
+  // pending writes to memtable finish.
+  std::condition_variable switch_cv_;
+  // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+  std::mutex switch_mutex_;
+  // Number of threads intending to write to memtable
+  std::atomic<size_t> pending_memtable_writes_ = {};
+
   // Each flush or compaction gets its own job id.
this counter makes sure // they're unique std::atomic next_job_id_; diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 1bc69b49182..66104d0ba28 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -228,6 +228,17 @@ static Status ValidateOptions( return Status::InvalidArgument("keep_log_file_num must be greater than 0"); } + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + return Status::OK(); } } // namespace diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 3edec9ac521..733eb408a8d 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -94,6 +94,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with seq_per_batch"); } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } // Otherwise IsLatestPersistentState optimization does not make sense assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); @@ -107,8 +112,39 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (two_write_queues_ && disable_memtable) { - return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref, seq_used, batch_cnt, pre_release_callback); + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; } if (immutable_db_options_.enable_pipelined_write) { @@ -534,23 +570,65 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, return w.FinalStatus(); } +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/, seq_per_batch_, sub_batch_cnt); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt == 0) { + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + // The 2nd write queue. If enabled it will be used only for WAL-only writes. // This is the only queue that updates LastPublishedSequence which is only // applicable in a two-queue setting. 
-Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, - WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref, - uint64_t* seq_used, size_t batch_cnt, - PreReleaseCallback* pre_release_callback) { +Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - true /* disable_memtable */, batch_cnt, - pre_release_callback); + disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); - nonmem_write_thread_.JoinBatchGroup(&w); + write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -563,9 +641,33 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. 
+ if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + WriteThread::WriteGroup write_group; uint64_t last_sequence; - nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only @@ -602,11 +704,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL size_t seq_inc = 0 /* total_count */; - if (seq_per_batch_) { + if (assign_order == kDoAssignOrder) { size_t total_batch_cnt = 0; for (auto* writer : write_group) { - assert(writer->batch_cnt); - total_batch_cnt += writer->batch_cnt; + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } } seq_inc = total_batch_cnt; } @@ -617,16 +721,21 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } + + size_t memtable_write_cnt = 0; auto curr_seq = last_sequence + 1; for (auto* writer : write_group) { if (writer->CallbackFailed()) { continue; } writer->sequence = curr_seq; - if (seq_per_batch_) { - assert(writer->batch_cnt); + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); curr_seq += writer->batch_cnt; } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } // else seq advances only by memtable writes } if (status.ok() && write_options.sync) { @@ -648,9 +757,8 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - const bool DISABLE_MEMTABLE = true; Status ws = writer->pre_release_callback->Callback( - writer->sequence, DISABLE_MEMTABLE, writer->log_used); + writer->sequence, disable_memtable, writer->log_used); if (!ws.ok()) { status = ws; break; @@ -658,7 +766,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } } } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } @@ -710,6 +826,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); status = SwitchWAL(write_context); } @@ -719,10 +836,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // thread is writing to another DB with the same write buffer, they 
may also // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. + WaitForPendingWrites(); status = HandleWriteBufferFull(write_context); } if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); status = ScheduleFlushes(write_context); } diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 294d0f581bc..a212c981286 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -204,6 +204,75 @@ TEST_F(DBMemTableTest, DuplicateSeq) { delete mem; } +// A simple test to verify that the concurrent merge writes is functional +TEST_F(DBMemTableTest, ConcurrentMergeWrite) { + int num_ops = 1000; + std::string value; + Status s; + MergeContext merge_context; + Options options; + // A merge operator that is not sensitive to concurrent writes since in this + // test we don't order the writes. + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.allow_concurrent_memtable_write = true; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTablePostProcessInfo post_process_info; + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + + // Put 0 as the base + PutFixed64(&value, static_cast(0)); + bool res = mem->Add(0, kTypeValue, "key", value); + ASSERT_TRUE(res); + value.clear(); + + // Write Merge concurrently + rocksdb::port::Thread write_thread1([&]() { + std::string v1; + for (int seq = 1; seq < num_ops / 2; seq++) { + PutFixed64(&v1, seq); + bool res1 = + mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info); + ASSERT_TRUE(res1); + v1.clear(); + } + }); + rocksdb::port::Thread write_thread2([&]() { + std::string v2; + for (int seq = num_ops / 2; seq < num_ops; seq++) { + PutFixed64(&v2, seq); + bool res2 = + mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info); + ASSERT_TRUE(res2); + v2.clear(); + } + }); + write_thread1.join(); + write_thread2.join(); + + Status status; + ReadOptions roptions; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey("key", kMaxSequenceNumber); + res = mem->Get(lkey, &value, &status, &merge_context, + &max_covering_tombstone_seq, roptions); + ASSERT_TRUE(res); + uint64_t ivalue = DecodeFixed64(Slice(value).data()); + uint64_t sum = 0; + for (int seq = 0; seq < num_ops; seq++) { + sum += seq; + } + ASSERT_EQ(ivalue, sum); + + delete mem; +} + TEST_F(DBMemTableTest, InsertWithHint) { Options options; options.allow_concurrent_memtable_write = false; diff --git a/db/db_test_util.cc b/db/db_test_util.cc index bee6b81d5dd..ebfc7a9cad3 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -341,6 +341,7 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); @@ -373,12 +374,14 @@ Options DBTestBase::GetOptions( case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kHashLinkList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); 
options.memtable_factory.reset( NewHashLinkListRepFactory(4, 0, 3, true, 4)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kDirectIO: { options.use_direct_reads = true; @@ -540,6 +543,11 @@ Options DBTestBase::GetOptions( options.manual_wal_flush = true; break; } + case kUnorderedWrite: { + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + } default: break; diff --git a/db/db_test_util.h b/db/db_test_util.h index 50109e0a406..f5d7fd1a75f 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -140,6 +140,11 @@ class SpecialMemTableRep : public MemTableRep { memtable_->Insert(handle); } + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + // Returns true iff an entry that compares equal to key is in the list. virtual bool Contains(const char* key) const override { return memtable_->Contains(key); @@ -688,6 +693,7 @@ class DBTestBase : public testing::Test { kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, kxxHash64Checksum, + kUnorderedWrite, // This must be the last line kEnd, }; diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cd3575861a8..b5abec40569 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -28,6 +28,9 @@ class FlushScheduler { // Filters column families that have been dropped. ColumnFamilyData* TakeNextColumnFamily(); + // This can be called concurrently with ScheduleFlush but it would miss all + // the scheduled flushes after the last synchronization. This would result + // into less precise enforcement of memtable sizes but should not matter much. bool Empty(); void Clear(); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 2dd0cff0b41..8a08cf9fede 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -142,6 +142,7 @@ class PlainTableDBTest : public testing::Test, options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.allow_mmap_reads = mmap_mode_; options.allow_concurrent_memtable_write = false; + options.unordered_write = false; return options; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 939b595305b..830fbeab15d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -1471,7 +1471,6 @@ class MemTableInserter : public WriteBatch::Handler { Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - assert(!concurrent_memtable_writes_); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); @@ -1498,6 +1497,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); // If we pass DB through and options.max_successive_merges is hit // during recovery, Get() will be issued which will try to acquire @@ -1505,6 +1506,7 @@ class MemTableInserter : public WriteBatch::Handler { // So we disable merge in recovery if (moptions->max_successive_merges > 0 && db_ != nullptr && recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); LookupKey lkey(key, sequence_); // Count the number of successive merges at the head @@ -1550,6 +1552,7 @@ class MemTableInserter : public WriteBatch::Handler { perform_merge = false; } else { // 3) Add value to 
memtable + assert(!concurrent_memtable_writes_); bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); @@ -1562,7 +1565,9 @@ class MemTableInserter : public WriteBatch::Handler { if (!perform_merge) { // Add merge operator to memtable - bool mem_res = mem->Add(sequence_, kTypeMerge, key, value); + bool mem_res = + mem->Add(sequence_, kTypeMerge, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index cb880560efc..7f2b20d892f 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -124,6 +124,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { {false, false, true, false, true}, }; + for (auto& unordered_write : {true, false}) { for (auto& seq_per_batch : {true, false}) { for (auto& two_queues : {true, false}) { for (auto& allow_parallel : {true, false}) { @@ -133,15 +134,22 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { for (auto& write_group : write_scenarios) { Options options; options.create_if_missing = true; + options.unordered_write = unordered_write; options.allow_concurrent_memtable_write = allow_parallel; options.enable_pipelined_write = enable_pipelined_write; options.two_write_queues = two_queues; + // Skip unsupported combinations if (options.enable_pipelined_write && seq_per_batch) { - // This combination is not supported continue; } if (options.enable_pipelined_write && options.two_write_queues) { - // This combination is not supported + continue; + } + if (options.unordered_write && + !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { continue; } @@ -358,8 +366,9 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } } -} -} + } + } + } } TEST_F(WriteCallbackTest, WriteCallBackTest) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index ed0709d22a1..5e75dd70964 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -845,6 +845,8 @@ rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( rocksdb_options_t*, uint32_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index a1071f62ec7..c8b4cc538d9 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -893,6 +893,31 @@ struct DBOptions { // Default: false bool enable_pipelined_write = false; + // Setting unordered_write to true trades higher write throughput with + // relaxing the immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy is one way + // to achieve immutable snapshots despite unordered_write. 
+  //
+  // By default, i.e., when it is false, rocksdb does not advance the sequence
+  // number for new snapshots unless all the writes with lower sequence numbers
+  // are already finished. This provides the immutability that we expect from
+  // snapshots. Moreover, since Iterator and MultiGet internally depend on
+  // snapshots, the snapshot immutability results in Iterator and MultiGet
+  // offering a consistent-point-in-time view. If set to true, although the
+  // Read-Your-Own-Write property is still provided, the snapshot immutability
+  // property is relaxed: the writes issued after the snapshot is obtained (with
+  // larger sequence numbers) will still not be visible to the reads from that
+  // snapshot; however, there still might be pending writes (with lower sequence
+  // numbers) that will change the state visible to the snapshot after they
+  // land in the memtable.
+  //
+  // Default: false
+  bool unordered_write = false;
+
   // If true, allow multi-writers to update mem tables in parallel.
   // Only some memtable_factory-s support concurrent writes; currently it
   // is implemented only for SkipListFactory. Concurrent memtable writes
diff --git a/options/db_options.cc b/options/db_options.cc
index 83f1a18b042..e180238f433 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -67,6 +67,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       listeners(options.listeners),
       enable_thread_tracking(options.enable_thread_tracking),
       enable_pipelined_write(options.enable_pipelined_write),
+      unordered_write(options.unordered_write),
       allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
       enable_write_thread_adaptive_yield(
          options.enable_write_thread_adaptive_yield),
@@ -185,6 +186,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                    enable_thread_tracking);
   ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d",
                    enable_pipelined_write);
+  ROCKS_LOG_HEADER(log, " Options.unordered_write: %d",
+                   unordered_write);
   ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d",
                    allow_concurrent_memtable_write);
   ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d",
diff --git a/options/db_options.h b/options/db_options.h
index 8d02003623e..67b26786f5e 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -60,6 +60,7 @@ struct ImmutableDBOptions {
   std::vector<std::shared_ptr<EventListener>> listeners;
   bool enable_thread_tracking;
   bool enable_pipelined_write;
+  bool unordered_write;
   bool allow_concurrent_memtable_write;
   bool enable_write_thread_adaptive_yield;
   uint64_t write_thread_max_yield_usec;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index a973bbfde51..c33c2be6fb7 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -103,6 +103,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
   options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;
   options.delayed_write_rate = mutable_db_options.delayed_write_rate;
   options.enable_pipelined_write = immutable_db_options.enable_pipelined_write;
+  options.unordered_write = immutable_db_options.unordered_write;
   options.allow_concurrent_memtable_write =
       immutable_db_options.allow_concurrent_memtable_write;
   options.enable_write_thread_adaptive_yield =
@@ -1583,6 +1584,9 @@ std::unordered_map<std::string, OptionTypeInfo>
         {"enable_pipelined_write",
          {offsetof(struct DBOptions, enable_pipelined_write),
           OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+        {"unordered_write",
+         {offsetof(struct DBOptions, unordered_write),
OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, {"allow_concurrent_memtable_write", {offsetof(struct DBOptions, allow_concurrent_memtable_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..79a4fa81475 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -279,6 +279,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "advise_random_on_open=true;" "fail_if_options_file_error=false;" "enable_pipelined_write=false;" + "unordered_write=false;" "allow_concurrent_memtable_write=true;" "wal_recovery_mode=kPointInTimeRecovery;" "enable_write_thread_adaptive_yield=true;" diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 47fe8e1b0e3..790a2c99ecc 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -227,7 +227,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument( @@ -268,6 +268,12 @@ Status BlockBasedTableFactory::SanitizeOptions( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } return Status::OK(); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b2562f4e539..b806fff8980 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -891,6 +891,9 @@ DEFINE_uint64(delayed_write_rate, 8388608u, DEFINE_bool(enable_pipelined_write, true, "Allow WAL and memtable writes to be pipelined"); +DEFINE_bool(unordered_write, false, + "Allow WAL and memtable writes to be pipelined"); + DEFINE_bool(allow_concurrent_memtable_write, true, "Allow multi-writers to update mem tables in parallel."); @@ -3552,6 +3555,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { options.enable_write_thread_adaptive_yield = FLAGS_enable_write_thread_adaptive_yield; options.enable_pipelined_write = FLAGS_enable_pipelined_write; + options.unordered_write = FLAGS_unordered_write; options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec; options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec; options.rate_limit_delay_max_milliseconds = diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8eb21777a99..05973e83aea 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -221,9 +221,18 @@ Status TransactionDB::Open( std::vector* handles, TransactionDB** dbptr) { Status s; DB* db = nullptr; + if (txn_db_options.write_policy == WRITE_COMMITTED && + db_options.unordered_write) { + return Status::NotSupported( + "WRITE_COMMITTED is incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_UNPREPARED && + db_options.unordered_write) { + // TODO(lth): support it + return Status::NotSupported( + 
"WRITE_UNPREPARED is currently incompatible with unordered_writes"); + } - ROCKS_LOG_WARN(db_options.info_log, "Transaction write_policy is %" PRId32, - static_cast(txn_db_options.write_policy)); std::vector column_families_copy = column_families; std::vector compaction_enabled_cf_indices; DBOptions db_options_2pc = db_options; @@ -238,6 +247,9 @@ Status TransactionDB::Open( s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db, use_seq_per_batch, use_batch_per_txn); if (s.ok()) { + ROCKS_LOG_WARN(db->GetDBOptions().info_log, + "Transaction write_policy is %" PRId32, + static_cast(txn_db_options.write_policy)); s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, dbptr); } diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 1a5bf2d6644..997a5abe2d8 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -42,40 +42,48 @@ namespace rocksdb { INSTANTIATE_TEST_CASE_P( DBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED), - std::make_tuple(false, true, WRITE_COMMITTED), - std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED), - std::make_tuple(false, false, WRITE_UNPREPARED), - std::make_tuple(false, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); INSTANTIATE_TEST_CASE_P( DBAsBaseDB, TransactionStressTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED), - std::make_tuple(false, true, WRITE_COMMITTED), - std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED), - std::make_tuple(false, false, WRITE_UNPREPARED), - std::make_tuple(false, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); INSTANTIATE_TEST_CASE_P( StackableDBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(true, true, WRITE_COMMITTED), - std::make_tuple(true, true, WRITE_PREPARED), - std::make_tuple(true, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite))); // MySQLStyleTransactionTest takes far too long for valgrind to run. 
#ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( MySQLStyleTransactionTest, MySQLStyleTransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED, false), - std::make_tuple(false, true, WRITE_COMMITTED, false), - std::make_tuple(false, false, WRITE_PREPARED, false), - std::make_tuple(false, false, WRITE_PREPARED, true), - std::make_tuple(false, true, WRITE_PREPARED, false), - std::make_tuple(false, true, WRITE_PREPARED, true), - std::make_tuple(false, false, WRITE_UNPREPARED, false), - std::make_tuple(false, false, WRITE_UNPREPARED, true), - std::make_tuple(false, true, WRITE_UNPREPARED, false), - std::make_tuple(false, true, WRITE_UNPREPARED, true))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { @@ -5646,7 +5654,7 @@ TEST_P(TransactionTest, DuplicateKeys) { } // do_rollback } // do_prepare - { + if (!options.unordered_write) { // Also test with max_successive_merges > 0. max_successive_merges will not // affect our algorithm for duplicate key insertion but we add the test to // verify that. 
@@ -5697,6 +5705,7 @@ TEST_P(TransactionTest, DuplicateKeys) { std::unique_ptr comp_gc(new ThreeBytewiseComparator()); cf_options.comparator = comp_gc.get(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); delete cf_handle; std::vector cfds{ diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 33b2c51ea2f..b4254870951 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -39,6 +39,8 @@ namespace rocksdb { // Return true if the ith bit is set in combination represented by comb bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); } +enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite }; + class TransactionTestBase : public ::testing::Test { public: TransactionDB* db; @@ -50,11 +52,13 @@ class TransactionTestBase : public ::testing::Test { bool use_stackable_db_; TransactionTestBase(bool use_stackable_db, bool two_write_queue, - TxnDBWritePolicy write_policy) + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) : db(nullptr), env(nullptr), use_stackable_db_(use_stackable_db) { options.create_if_missing = true; options.max_write_buffer_number = 2; options.write_buffer_size = 4 * 1024; + options.unordered_write = write_ordering == kUnorderedWrite; options.level0_file_num_compaction_trigger = 2; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); env = new FaultInjectionTestEnv(Env::Default()); @@ -352,6 +356,9 @@ class TransactionTestBase : public ::testing::Test { Transaction* txn; txn_db_options.write_policy = from_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } ReOpen(); for (int i = 0; i < 1024; i++) { @@ -400,6 +407,9 @@ class TransactionTestBase : public ::testing::Test { } // for i txn_db_options.write_policy = to_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } auto db_impl = reinterpret_cast(db->GetRootDB()); // Before upgrade/downgrade the WAL must be emptied if (empty_wal) { @@ -437,13 +447,14 @@ class TransactionTestBase : public ::testing::Test { } }; -class TransactionTest : public TransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { +class TransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { public: TransactionTest() : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())){}; + std::get<2>(GetParam()), std::get<3>(GetParam())){}; }; class TransactionStressTest : public TransactionTest {}; @@ -451,12 +462,12 @@ class TransactionStressTest : public TransactionTest {}; class MySQLStyleTransactionTest : public TransactionTestBase, virtual public ::testing::WithParamInterface< - std::tuple> { + std::tuple> { public: MySQLStyleTransactionTest() : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())), - with_slow_threads_(std::get<3>(GetParam())) { + std::get<2>(GetParam()), std::get<3>(GetParam())), + with_slow_threads_(std::get<4>(GetParam())) { if (with_slow_threads_ && (txn_db_options.write_policy == WRITE_PREPARED || txn_db_options.write_policy == WRITE_UNPREPARED)) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index f2f3f30e26e..d5a03cd0408 100644 --- 
a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -342,8 +342,10 @@ class WritePreparedTxnDBMock : public WritePreparedTxnDB { class WritePreparedTransactionTestBase : public TransactionTestBase { public: WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue, - TxnDBWritePolicy write_policy) - : TransactionTestBase(use_stackable_db, two_write_queue, write_policy){}; + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) + : TransactionTestBase(use_stackable_db, two_write_queue, write_policy, + write_ordering){}; protected: void UpdateTransactionDBOptions(size_t snapshot_cache_bits, @@ -518,26 +520,26 @@ class WritePreparedTransactionTestBase : public TransactionTestBase { class WritePreparedTransactionTest : public WritePreparedTransactionTestBase, virtual public ::testing::WithParamInterface< - std::tuple> { + std::tuple> { public: WritePreparedTransactionTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; }; #ifndef ROCKSDB_VALGRIND_RUN class SnapshotConcurrentAccessTest : public WritePreparedTransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { + virtual public ::testing::WithParamInterface> { public: SnapshotConcurrentAccessTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())), - split_id_(std::get<3>(GetParam())), - split_cnt_(std::get<4>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; protected: // A test is split into split_cnt_ tests, each identified with split_id_ where @@ -549,15 +551,15 @@ class SnapshotConcurrentAccessTest class SeqAdvanceConcurrentTest : public WritePreparedTransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { + virtual public ::testing::WithParamInterface> { public: SeqAdvanceConcurrentTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())), - split_id_(std::get<3>(GetParam())), - split_cnt_(std::get<4>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; protected: // A test is split into split_cnt_ tests, each identified with split_id_ where @@ -568,81 +570,152 @@ class SeqAdvanceConcurrentTest INSTANTIATE_TEST_CASE_P( WritePreparedTransactionTest, WritePreparedTransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); #ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SnapshotConcurrentAccessTest, - ::testing::Values(std::make_tuple(false, true, WRITE_PREPARED, 0, 20), - std::make_tuple(false, true, 
WRITE_PREPARED, 1, 20), - std::make_tuple(false, true, WRITE_PREPARED, 2, 20), - std::make_tuple(false, true, WRITE_PREPARED, 3, 20), - std::make_tuple(false, true, WRITE_PREPARED, 4, 20), - std::make_tuple(false, true, WRITE_PREPARED, 5, 20), - std::make_tuple(false, true, WRITE_PREPARED, 6, 20), - std::make_tuple(false, true, WRITE_PREPARED, 7, 20), - std::make_tuple(false, true, WRITE_PREPARED, 8, 20), - std::make_tuple(false, true, WRITE_PREPARED, 9, 20), - std::make_tuple(false, true, WRITE_PREPARED, 10, 20), - std::make_tuple(false, true, WRITE_PREPARED, 11, 20), - std::make_tuple(false, true, WRITE_PREPARED, 12, 20), - std::make_tuple(false, true, WRITE_PREPARED, 13, 20), - std::make_tuple(false, true, WRITE_PREPARED, 14, 20), - std::make_tuple(false, true, WRITE_PREPARED, 15, 20), - std::make_tuple(false, true, WRITE_PREPARED, 16, 20), - std::make_tuple(false, true, WRITE_PREPARED, 17, 20), - std::make_tuple(false, true, WRITE_PREPARED, 18, 20), - std::make_tuple(false, true, WRITE_PREPARED, 19, 20))); + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20), + 
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20))); INSTANTIATE_TEST_CASE_P( OneWriteQueue, SnapshotConcurrentAccessTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED, 0, 20), - std::make_tuple(false, false, WRITE_PREPARED, 1, 20), - std::make_tuple(false, false, WRITE_PREPARED, 2, 20), - std::make_tuple(false, false, WRITE_PREPARED, 3, 20), - std::make_tuple(false, false, WRITE_PREPARED, 4, 20), - std::make_tuple(false, false, WRITE_PREPARED, 5, 20), - std::make_tuple(false, false, WRITE_PREPARED, 6, 20), - std::make_tuple(false, false, WRITE_PREPARED, 7, 20), - std::make_tuple(false, false, WRITE_PREPARED, 8, 20), - std::make_tuple(false, false, WRITE_PREPARED, 9, 20), - std::make_tuple(false, false, WRITE_PREPARED, 10, 20), - std::make_tuple(false, false, WRITE_PREPARED, 11, 20), - std::make_tuple(false, false, WRITE_PREPARED, 12, 20), - std::make_tuple(false, false, WRITE_PREPARED, 13, 20), - std::make_tuple(false, false, WRITE_PREPARED, 14, 20), - std::make_tuple(false, false, WRITE_PREPARED, 15, 20), - std::make_tuple(false, false, WRITE_PREPARED, 16, 20), - std::make_tuple(false, false, WRITE_PREPARED, 17, 20), - std::make_tuple(false, false, WRITE_PREPARED, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, 19, 20))); + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 20), + 
std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 19, + 20))); INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SeqAdvanceConcurrentTest, - ::testing::Values(std::make_tuple(false, true, WRITE_PREPARED, 0, 10), - std::make_tuple(false, true, WRITE_PREPARED, 1, 10), - std::make_tuple(false, true, WRITE_PREPARED, 2, 10), - std::make_tuple(false, true, WRITE_PREPARED, 3, 10), - std::make_tuple(false, true, WRITE_PREPARED, 4, 10), - std::make_tuple(false, true, WRITE_PREPARED, 5, 10), - std::make_tuple(false, true, WRITE_PREPARED, 6, 10), - std::make_tuple(false, true, WRITE_PREPARED, 7, 10), - std::make_tuple(false, true, WRITE_PREPARED, 8, 10), - std::make_tuple(false, true, WRITE_PREPARED, 9, 10))); + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10))); INSTANTIATE_TEST_CASE_P( OneWriteQueue, SeqAdvanceConcurrentTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED, 0, 10), - std::make_tuple(false, false, WRITE_PREPARED, 1, 10), - std::make_tuple(false, false, WRITE_PREPARED, 2, 10), - std::make_tuple(false, false, WRITE_PREPARED, 3, 10), - std::make_tuple(false, false, WRITE_PREPARED, 4, 10), - std::make_tuple(false, false, WRITE_PREPARED, 5, 10), - std::make_tuple(false, false, WRITE_PREPARED, 6, 10), - std::make_tuple(false, false, 
WRITE_PREPARED, 7, 10),
- std::make_tuple(false, false, WRITE_PREPARED, 8, 10),
- std::make_tuple(false, false, WRITE_PREPARED, 9, 10)));
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10),
+
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
#endif // ROCKSDB_VALGRIND_RUN
TEST_P(WritePreparedTransactionTest, CommitMapTest) {
diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index 9aee33b078f..914f3f581e4 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -20,7 +20,8 @@ class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
WriteUnpreparedTransactionTestBase(bool use_stackable_db,
bool two_write_queue,
TxnDBWritePolicy write_policy)
- : TransactionTestBase(use_stackable_db, two_write_queue, write_policy){}
+ : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+ kOrderedWrite) {}
};
class WriteUnpreparedTransactionTest
From 6492430eaf1a13730eec81321528558cbf486c96 Mon Sep 17 00:00:00 2001
From: anand76
Date: Tue, 14 May 2019 11:54:52 -0700
Subject: [PATCH 028/572] Fix a bug in db_stress and an incorrect assertion in FilePickerMultiGet (#5301)

Summary:
This PR has two fixes for crash test failures -
1. Fix a bug in TestMultiGet() in db_stress that was passing the list of keys to MultiGet() in the wrong order, thus causing actual values to not match expected values
2. Remove an incorrect assertion in FilePickerMultiGet::GetNextFileInLevelWithKeys() that checks that files in a level are in sorted order. This is not true with MultiGet(), especially if there are duplicate keys and we may have to go back one file for the next key.
Furthermore, this assertion makes more sense when a new version is created, rather than at lookup time Test - asan_crash and ubsan_crash tests Pull Request resolved: https://github.com/facebook/rocksdb/pull/5301 Differential Revision: D15337383 Pulled By: anand1976 fbshipit-source-id: 35092cb15bbc1700e5e823cbe07bfa62f1e9e6c6 --- db/version_set.cc | 41 ++--------------------------------------- tools/db_stress.cc | 28 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 51 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 84302556e66..f0dfe765871 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -353,7 +353,7 @@ class FilePickerMultiGet { struct FilePickerContext; public: - FilePickerMultiGet(std::vector* files, MultiGetRange* range, + FilePickerMultiGet(MultiGetRange* range, autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, @@ -368,18 +368,12 @@ class FilePickerMultiGet { maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif for (auto iter = range_->begin(); iter != range_->end(); ++iter) { fp_ctx_array_[iter.index()] = FilePickerContext(0, FileIndexer::kLevelMaxIndex); @@ -485,25 +479,6 @@ class FilePickerMultiGet { } else { file_hit = true; } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (f != prev_file_) { - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else if (fp_ctx.curr_index_in_curr_level > 0) { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(!NewestFirstBySeqNo( - files_[0][fp_ctx.curr_index_in_curr_level], - files_[0][fp_ctx.curr_index_in_curr_level - 1])); - } - } - prev_file_ = f; - } -#endif if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this // file, so stop looking further. Also don't increment megt_iter_ @@ -635,9 +610,6 @@ class FilePickerMultiGet { bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -645,9 +617,6 @@ class FilePickerMultiGet { FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. 
@@ -656,9 +625,6 @@ class FilePickerMultiGet { MultiGetRange::Iterator mget_iter = current_level_range_.begin(); if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -754,9 +720,6 @@ class FilePickerMultiGet { fp_ctx.curr_index_in_curr_level = start_index; } if (level_contains_keys) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -1800,7 +1763,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, MultiGetRange file_picker_range(*range, range->begin(), range->end()); FilePickerMultiGet fp( - storage_info_.files_, &file_picker_range, + &file_picker_range, &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c6959802be3..6eb974e0934 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -3609,36 +3609,40 @@ class BatchedOpsStressTest : public StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) { size_t num_keys = rand_keys.size(); - std::vector statuses(num_keys); - std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; - for (int key = 0; key < 10; ++key) { + std::vector ret_status(num_keys); + std::array keys = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + size_t num_prefixes = keys.size(); + for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) { std::vector key_slices; - std::vector values(num_keys); + std::vector values(num_prefixes); + std::vector statuses(num_prefixes); ReadOptions readoptionscopy = readoptions; readoptionscopy.snapshot = db_->GetSnapshot(); std::vector key_str; - key_str.reserve(num_keys); - key_slices.reserve(num_keys); + key_str.reserve(num_prefixes); + key_slices.reserve(num_prefixes); std::string from_db; ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; - for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) { + for (size_t key = 0; key < num_prefixes; ++key) { key_str.emplace_back(keys[key] + Key(rand_keys[rand_key])); key_slices.emplace_back(key_str.back()); } - db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(), + db_->MultiGet(readoptionscopy, cfh, num_prefixes, key_slices.data(), values.data(), statuses.data()); - for (size_t i = 0; i < num_keys; i++) { + for (size_t i = 0; i < num_prefixes; i++) { Status s = statuses[i]; if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); thread->stats.AddErrors(1); + ret_status[rand_key] = s; // we continue after error rather than exiting so that we can // find more errors if any } else if (s.IsNotFound()) { thread->stats.AddGets(1, 0); + ret_status[rand_key] = s; } else { - char expected_prefix = (keys[key])[0]; + char expected_prefix = (keys[i])[0]; char actual_prefix = (values[i])[0]; if (actual_prefix != expected_prefix) { fprintf(stderr, "error expected prefix = %c actual = %c\n", @@ -3655,7 +3659,7 @@ class BatchedOpsStressTest : public StressTest { db_->ReleaseSnapshot(readoptionscopy.snapshot); // Now that we retrieved all values, check that they all match - for (size_t i = 1; i < num_keys; i++) { + for (size_t i = 1; i < num_prefixes; i++) { if 
(values[i] != values[0]) {
fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
key_str[i].c_str(),
@@ -3667,7 +3671,7 @@
}
}
- return statuses;
+ return ret_status;
}

// Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
From 3c3252a06a77c5f6877392b882014dc8c8b2bd8f Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Wed, 15 May 2019 11:18:34 -0700
Subject: [PATCH 029/572] Fix tsan complaint in ConcurrentMergeWrite test (#5308)

Summary:
The test was not using a separate MemTablePostProcessInfo per memtable insert thread, and thus tsan was complaining about a data race.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5308

Differential Revision: D15356420

Pulled By: maysamyabandeh

fbshipit-source-id: 46c2f2d19fb02c3c775b587aa09ca9c0dae6ed04
---
 db/db_memtable_test.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index a212c981286..184c6f53b11 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -222,7 +222,6 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
options.allow_concurrent_memtable_write = true;
ImmutableCFOptions ioptions(options);
WriteBufferManager wb(options.db_write_buffer_size);
- MemTablePostProcessInfo post_process_info;
MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
kMaxSequenceNumber, 0 /* column_family_id */);
@@ -234,21 +233,23 @@
// Write Merge concurrently
rocksdb::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
std::string v1;
for (int seq = 1; seq < num_ops / 2; seq++) {
PutFixed64(&v1, seq);
bool res1 =
- mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info);
+ mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1);
ASSERT_TRUE(res1);
v1.clear();
}
});
rocksdb::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
std::string v2;
for (int seq = num_ops / 2; seq < num_ops; seq++) {
PutFixed64(&v2, seq);
bool res2 =
- mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info);
+ mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2);
ASSERT_TRUE(res2);
v2.clear();
}
From 189e711b3740ae3fbe2eeb8cd5a12419346bd627 Mon Sep 17 00:00:00 2001
From: Andres Suarez
Date: Wed, 15 May 2019 11:28:39 -0700
Subject: [PATCH 030/572] Text lint all .gitignore files

Reviewed By: scottrice, pallotron

Differential Revision: D15353820

fbshipit-source-id: 74f9eaadc90363a958692259f5cb66cef91ac8ef
---
 docs/.gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index e48dc98be89..3938549cbe6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -6,4 +6,3 @@ _site
.sass-cache
*.psd
*~
-
From a42757607d5c7bc503958fd8027a4f9ef5cfceaf Mon Sep 17 00:00:00 2001
From: Thomas Fersch
Date: Wed, 15 May 2019 13:14:18 -0700
Subject: [PATCH 031/572] Use pre-increment instead of post-increment for iterators (#5296)

Summary:
Google C++ style guide indicates pre-increment should be used for iterators: https://google.github.io/styleguide/cppguide.html#Preincrement_and_Predecrement. Replaced all instances of ' it++' with ' ++it' (where the type is an iterator). This covers the cases where the iterator is named 'it'.
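(Illustrative aside, not part of patch 031/572: for a non-trivial iterator, post-increment must construct and return a copy of the iterator's old value, while pre-increment advances in place and returns a reference; when the returned value is discarded, the copy is pure overhead. The toy program below is hypothetical standard C++, not RocksDB code.)

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
      std::map<int, std::string> m = {{1, "one"}, {2, "two"}, {3, "three"}};
      // it++ would materialize a temporary iterator (a copy of the
      // pre-advance position) purely to discard it; ++it avoids that copy.
      for (auto it = m.begin(); it != m.end(); ++it) {
        std::printf("%d -> %s\n", it->first, it->second.c_str());
      }
      return 0;
    }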
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5296 Differential Revision: D15301256 Pulled By: tfersch fbshipit-source-id: 2803483c1392504ad3b281d21db615429c71114b --- db/compaction_job.cc | 2 +- db/db_impl.h | 2 +- db/db_impl_compaction_flush.cc | 10 +++++----- db/memtable_list.cc | 2 +- db/prefix_test.cc | 2 +- utilities/transactions/pessimistic_transaction_db.cc | 4 ++-- utilities/transactions/transaction_test.cc | 4 ++-- .../transactions/write_prepared_transaction_test.cc | 6 +++--- utilities/transactions/write_prepared_txn_db.cc | 4 ++-- utilities/transactions/write_unprepared_txn_db.cc | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 00386a99ad4..fb77431fddc 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -519,7 +519,7 @@ void CompactionJob::GenSubcompactionBoundaries() { auto* v = compact_->compaction->input_version(); for (auto it = bounds.begin();;) { const Slice a = *it; - it++; + ++it; if (it == bounds.end()) { break; diff --git a/db/db_impl.h b/db/db_impl.h index 0ee5d82b56c..c4fae9a6ad5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -725,7 +725,7 @@ class DBImpl : public DB { void DeleteAllRecoveredTransactions() { for (auto it = recovered_transactions_.begin(); - it != recovered_transactions_.end(); it++) { + it != recovered_transactions_.end(); ++it) { delete it->second; } recovered_transactions_.clear(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 3fbf24e49f8..900ea4acdcd 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -2794,7 +2794,7 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { it = manual_compaction_dequeue_.erase(it); return; } - it++; + ++it; } assert(false); return; @@ -2815,7 +2815,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { bool seen = false; while (it != manual_compaction_dequeue_.end()) { if (m == (*it)) { - it++; + ++it; seen = true; continue; } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { @@ -2824,7 +2824,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { // and (*it) is ahead in the queue and is not yet in progress return true; } - it++; + ++it; } return false; } @@ -2842,7 +2842,7 @@ bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { // in progress return true; } - it++; + ++it; } return false; } @@ -2855,7 +2855,7 @@ bool DBImpl::HasExclusiveManualCompaction() { if ((*it)->exclusive) { return true; } - it++; + ++it; } return false; } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 69beb77f965..21b44b1798a 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -437,7 +437,7 @@ Status MemTableList::TryInstallMemtableFlushResults( ++mem_id; } } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { MemTable* m = *it; // commit failed. setup state so that we can flush again. 
ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
index ac854cb3dbd..be420ded183 100644
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -751,7 +751,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) {
for (size_t k = 0; k < 9; k++) {
if (rnd.OneIn(2) || it == whole_map.begin()) {
iter->Next();
- it++;
+ ++it;
if (FLAGS_enable_print) {
std::cout << "Next >> ";
}
diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc
index 05973e83aea..ecf6d2ff387 100644
--- a/utilities/transactions/pessimistic_transaction_db.cc
+++ b/utilities/transactions/pessimistic_transaction_db.cc
@@ -121,7 +121,7 @@ Status PessimisticTransactionDB::Initialize(
assert(dbimpl != nullptr);
auto rtrxs = dbimpl->recovered_transactions();
- for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) {
+ for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
auto recovered_trx = it->second;
assert(recovered_trx);
assert(recovered_trx->batches_.size() == 1);
@@ -594,7 +594,7 @@ void PessimisticTransactionDB::GetAllPreparedTransactions(
assert(transv);
transv->clear();
std::lock_guard lock(name_map_mutex_);
- for (auto it = transactions_.begin(); it != transactions_.end(); it++) {
+ for (auto it = transactions_.begin(); it != transactions_.end(); ++it) {
if (it->second->GetState() == Transaction::PREPARED) {
transv->push_back(it->second);
}
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 997a5abe2d8..2433af82637 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -567,7 +567,7 @@ TEST_P(TransactionTest, DeadlockCycleShared) {
TransactionID leaf_id =
dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
- for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) {
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
auto dl_node = *it;
ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
ASSERT_EQ(dl_node.m_cf_id, 0);
@@ -774,7 +774,7 @@ TEST_P(TransactionStressTest, DeadlockCycle) {
}
// Iterates backwards over path verifying decreasing txn_ids.
- for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) {
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
auto dl_node = *it;
ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1);
ASSERT_EQ(dl_node.m_cf_id, 0);
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index d5a03cd0408..6bad81db0ee 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -1099,7 +1099,7 @@ TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccessTest) {
new_snapshots.push_back(snapshots[old_snapshots.size() + i]);
}
for (auto it = common_snapshots.begin(); it != common_snapshots.end();
- it++) {
+ ++it) {
auto snapshot = *it;
// Create a commit entry that is around the snapshot and thus should
// not be discarded
@@ -1166,12 +1166,12 @@ TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasicTest) {
// b.
delayed prepared should contain every txn <= max and prepared should // only contain txns > max auto it = initial_prepared.begin(); - for (; it != initial_prepared.end() && *it <= new_max; it++) { + for (; it != initial_prepared.end() && *it <= new_max; ++it) { ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it)); } ASSERT_TRUE(wp_db->delayed_prepared_.empty()); for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty(); - it++, wp_db->prepared_txns_.pop()) { + ++it, wp_db->prepared_txns_.pop()) { ASSERT_EQ(*it, wp_db->prepared_txns_.top()); } ASSERT_TRUE(it == initial_prepared.end()); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 8a7883c0504..3b09cbbf7d6 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -798,7 +798,7 @@ void WritePreparedTxnDB::UpdateSnapshots( // afterwards. size_t i = 0; auto it = snapshots.begin(); - for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; it++, i++) { + for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) { snapshot_cache_[i].store(*it, std::memory_order_release); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); @@ -812,7 +812,7 @@ void WritePreparedTxnDB::UpdateSnapshots( } #endif snapshots_.clear(); - for (; it != snapshots.end(); it++) { + for (; it != snapshots.end(); ++it) { // Insert them to a vector that is less efficient to access // concurrently snapshots_.push_back(*it); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 4fcbfbc37c5..a1aeedf2e15 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -46,7 +46,7 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( }; // Iterate starting with largest sequence number. - for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); it++) { + for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) { auto last_visible_txn = it->first - 1; const auto& batch = it->second.batch_; WriteBatch rollback_batch; From da7c89d79d7033a53f30f82da3630ba3a0a77b8d Mon Sep 17 00:00:00 2001 From: Yuqi Gu Date: Wed, 15 May 2019 13:24:36 -0700 Subject: [PATCH 032/572] RocksDB Cmake changes for Arm64 CRC32 Optimization (#5304) Summary: Add CMake build for RocksDB CRC32 Optimization on Arm64. 
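(Illustrative aside, not part of patch 032/572: building with -march=armv8-a+crc makes the compiler define __ARM_FEATURE_CRC32 and expose the CRC32 intrinsics from <arm_acle.h>, which is what a guarded source file such as util/crc32c_arm64.cc can build on. The standalone sketch below is hypothetical, not RocksDB code, and includes a fallback branch so it also compiles on non-ARM targets.)

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    #if defined(__ARM_FEATURE_CRC32)
    #include <arm_acle.h>
    // Byte-at-a-time CRC32C using the ARMv8 CRC extension; a tuned version
    // would also use the wider __crc32cw/__crc32cd forms.
    static uint32_t Crc32c(uint32_t crc, const uint8_t* data, size_t len) {
      for (size_t i = 0; i < len; ++i) {
        crc = __crc32cb(crc, data[i]);
      }
      return crc;
    }
    #endif

    int main() {
    #if defined(__ARM_FEATURE_CRC32)
      const uint8_t buf[] = {'r', 'o', 'c', 'k', 's'};
      std::printf("crc32c = %08x\n",
                  (unsigned)Crc32c(0xffffffffu, buf, sizeof(buf)));
    #else
      std::printf("built without the ARMv8 CRC32 extension\n");
    #endif
      return 0;
    }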
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5304

Differential Revision: D15355193

Pulled By: miasantreble

fbshipit-source-id: 8d750a444274fbde14e510f51290631a369026b8
---
 CMakeLists.txt | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4feee986c3..5bb0c089f2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,15 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
endif(HAS_ALTIVEC)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+ CHECK_C_COMPILER_FLAG("-march=armv8-a+crc" HAS_ARMV8_CRC)
+ if(HAS_ARMV8_CRC)
+ message(STATUS " HAS_ARMV8_CRC yes")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc -Wno-unused-function")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc -Wno-unused-function")
+ endif(HAS_ARMV8_CRC)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+
option(PORTABLE "build a portable binary" OFF)
option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
if(PORTABLE)
@@ -213,7 +222,7 @@ else()
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
else()
- if(NOT HAVE_POWER8)
+ if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
endif()
@@ -694,6 +703,11 @@ if(HAVE_POWER8)
util/crc32c_ppc_asm.S)
endif(HAVE_POWER8)
+if(HAS_ARMV8_CRC)
+ list(APPEND SOURCES
+ util/crc32c_arm64.cc)
+endif(HAS_ARMV8_CRC)
+
if(WIN32)
list(APPEND SOURCES
port/win/io_win.cc
From ad27045d14871d7edbee606ec19108c89c974336 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 15 May 2019 13:36:01 -0700
Subject: [PATCH 033/572] Update HISTORY after cherrypicking a bug fix to 6.2 (#5309)

Summary:
After cherry-picking a bug fix to the 6.2.fb branch, update the HISTORY.md file to reflect this change.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5309

Differential Revision: D15358002

Pulled By: riversand963

fbshipit-source-id: 5a60510ec6dd444ce5ffaefc69b2e4c38914a921
---
 HISTORY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 919dea21133..9cf8a88da04 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -13,7 +13,6 @@
* Merging iterator to avoid child iterator reseek for some cases
### Bug Fixes
-* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag.
## 6.2.0 (4/30/2019)
### New Features
@@ -34,6 +33,7 @@
* Fix a race condition between WritePrepared::Get and ::Put with duplicate keys.
* Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor.
* Close a WAL file before another thread deletes it.
+* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag.
## 6.1.1 (4/9/2019)
### New Features
From f0e821619742a8e97521d035c7e527c21743530a Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Wed, 15 May 2019 13:48:59 -0700
Subject: [PATCH 034/572] WritePrepared: Fix deadlock in WriteRecoverableState (#5306)

Summary:
The recent improvement in https://github.com/facebook/rocksdb/pull/3661 could cause a deadlock: When writing recoverable state, we also commit its sequence number to the commit table, which could result in evicting an existing commit entry, which could result in advancing max_evicted_seq_, which would need to get snapshots from the database, which requires obtaining the db mutex. The patch releases db_mutex before calling the callback in WriteRecoverableState to avoid the potential deadlock. It also improves the stress tests to let the issue manifest in the tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5306

Differential Revision: D15341458

Pulled By: maysamyabandeh

fbshipit-source-id: 05dcbed7e21b789fd1e5fd5ee8eea08077162323
---
 db/compaction_iterator.cc | 14 --------------
 db/db_impl_write.cc | 12 +++++++++---
 util/transaction_test_util.cc | 6 ++++++
 utilities/transactions/transaction_test.cc | 3 +++
 utilities/transactions/transaction_test.h | 1 +
 5 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index bce0b82dbc7..ca55eef7123 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -493,20 +493,6 @@ void CompactionIterator::NextFromInput() {
// in this snapshot.
assert(last_sequence >= current_user_key_sequence_);
- // Note2: if last_snapshot < current_user_key_snapshot, it can only
- // mean last_snapshot is released between we process last value and
- // this value, and findEarliestVisibleSnapshot returns the next snapshot
- // as current_user_key_snapshot. In this case last value and current
- // value are both in current_user_key_snapshot currently.
- // Although last_snapshot is released we might still get a definitive
- // response when key sequence number changes, e.g., when seq is determined
- // too old and visible in all snapshots.
- assert(last_snapshot == current_user_key_snapshot_ ||
- (snapshot_checker_ != nullptr &&
- snapshot_checker_->CheckInSnapshot(current_user_key_sequence_,
- last_snapshot) !=
- SnapshotCheckerResult::kNotInSnapshot));
-
++iter_stats_.num_record_drop_hidden; // (A)
input_->Next();
} else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc
index 733eb408a8d..f4c72e298ee 100644
--- a/db/db_impl_write.cc
+++ b/db/db_impl_write.cc
@@ -214,9 +214,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
WriteThread::WriteGroup write_group;
bool in_parallel_group = false;
uint64_t last_sequence = kMaxSequenceNumber;
- if (!two_write_queues_) {
- last_sequence = versions_->LastSequence();
- }
mutex_.Lock();
@@ -231,6 +228,11 @@
PERF_TIMER_STOP(write_pre_and_post_process_time);
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
PERF_TIMER_START(write_pre_and_post_process_time);
}
@@ -1113,8 +1115,12 @@ Status DBImpl::WriteRecoverableState() {
for (uint64_t sub_batch_seq = seq + 1;
sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
status = recoverable_state_pre_release_callback_->Callback(
sub_batch_seq, !DISABLE_MEMTABLE, no_log_num);
+ mutex_.Lock();
}
}
if (status.ok()) {
diff --git a/util/transaction_test_util.cc b/util/transaction_test_util.cc
index 30cff11e14d..bd2d6afdca0 100644
--- a/util/transaction_test_util.cc
+++ b/util/transaction_test_util.cc
@@ -205,6 +205,12 @@ bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn,
ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), s.ToString().c_str(), txn->GetName().c_str());
+ if (rand_->OneIn(20)) {
+ // This currently only tests the mechanics of writing commit time
+ // write batch so the exact values would not matter.
+ s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog");
+ assert(s.ok());
+ }
db->GetDBOptions().env->SleepForMicroseconds(
static_cast(cmt_delay_ms_ * 1000));
}
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 2433af82637..6c71b679d60 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -5096,6 +5096,9 @@ Status TransactionStressTestInserter(
WriteOptions write_options;
ReadOptions read_options;
TransactionOptions txn_options;
+ if (rand->OneIn(2)) {
+ txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+ }
// Inside the inserter we might also retake the snapshot. We do both since two
// separate functions are engaged for each.
txn_options.set_snapshot = rand->OneIn(2);
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index b4254870951..00fa6cf0364 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -477,6 +477,7 @@ class MySQLStyleTransactionTest
// structures.
txn_db_options.wp_snapshot_cache_bits = 1;
txn_db_options.wp_commit_cache_bits = 10;
+ options.write_buffer_size = 1024;
EXPECT_OK(ReOpen());
}
};
From 468ca611052eb207cfa6f312c90be1aff9de48ba Mon Sep 17 00:00:00 2001
From: Raphael Bost
Date: Wed, 15 May 2019 14:16:36 -0700
Subject: [PATCH 035/572] Break large file writes into 1GB chunks (#5213)

Summary:
This is a workaround for the issue described in #5169. It has been tested on a database with very large values, but no dedicated test has been added to the code base.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5213

Differential Revision: D15243116

Pulled By: siying

fbshipit-source-id: e0c226a6cd71a60924dcd7ce7af74abcb4054484
---
 env/io_posix.cc | 135 ++++++++++++++++++++++++++++--------------------
 1 file changed, 79 insertions(+), 56 deletions(-)

diff --git a/env/io_posix.cc b/env/io_posix.cc
index 0f86c3ff93f..0ced06ff262 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -37,7 +37,7 @@
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
-#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif
namespace rocksdb {
@@ -58,6 +58,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
namespace {
+// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
+// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
+// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
+// the writes aligned.
+
+bool PosixWrite(int fd, const char* buf, size_t nbyte) {
+ const size_t kLimit1Gb = 1UL << 30;
+
+ const char* src = buf;
+ size_t left = nbyte;
+
+ while (left != 0) {
+ size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+ ssize_t done = write(fd, src, bytes_to_write);
+ if (done < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ return false;
+ }
+ left -= done;
+ src += done;
+ }
+ return true;
+}
+
+bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
+ const size_t kLimit1Gb = 1UL << 30;
+
+ const char* src = buf;
+ size_t left = nbyte;
+
+ while (left != 0) {
+ size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+ ssize_t done = pwrite(fd, src, bytes_to_write, offset);
+ if (done < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ return false;
+ }
+ left -= done;
+ offset += done;
+ src += done;
+ }
+
+ return true;
+}
+
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
struct stat buf;
@@ -180,7 +231,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) {
return uintptr_t(ptr) % sector_size == 0;
}
-}
+} // namespace
#endif
/*
@@ -752,9 +803,9 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
int alloc_status = 0;
if (allow_fallocate_) {
- alloc_status = fallocate(
- fd_, fallocate_with_keep_size_ ?
FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -801,19 +852,13 @@ Status PosixWritableFile::Append(const Slice& data) { assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = write(fd_, src, left); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While appending to file", filename_, errno); - } - left -= done; - src += done; + size_t nbytes = data.size(); + + if (!PosixWrite(fd_, src, nbytes)) { + return IOError("While appending to file", filename_, errno); } - filesize_ += data.size(); + + filesize_ += nbytes; return Status::OK(); } @@ -825,21 +870,12 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { } assert(offset <= std::numeric_limits::max()); const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, static_cast(offset)); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While pwrite to file at offset " + ToString(offset), - filename_, errno); - } - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError("While pwrite to file at offset " + ToString(offset), + filename_, errno); } - filesize_ = offset; + filesize_ = offset + nbytes; return Status::OK(); } @@ -891,8 +927,8 @@ Status PosixWritableFile::Close() { // If not, we should hack it with FALLOC_FL_PUNCH_HOLE if (result == 0 && (file_stats.st_size + file_stats.st_blksize - 1) / - file_stats.st_blksize != - file_stats.st_blocks / (file_stats.st_blksize / 512)) { + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { IOSTATS_TIMER_GUARD(allocate_nanos); if (allow_fallocate_) { fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, @@ -942,10 +978,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { } #else (void)hint; -#endif // ROCKSDB_VALGRIND_RUN +#endif // ROCKSDB_VALGRIND_RUN #else (void)hint; -#endif // OS_LINUX +#endif // OS_LINUX } Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { @@ -974,9 +1010,9 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -1037,24 +1073,11 @@ PosixRandomRWFile::~PosixRandomRWFile() { Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); - if (done < 0) { - // error while writing to file - if (errno == EINTR) { - // write was interrupted, try again. 
- continue; - } - return IOError( - "While write random read/write file at offset " + ToString(offset), - filename_, errno); - } - - // Wrote `done` bytes - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError( + "While write random read/write file at offset " + ToString(offset), + filename_, errno); } return Status::OK(); From 8149bb9d6ab8ef55a30e9906f0bca8e6e0a42bec Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Wed, 15 May 2019 14:19:04 -0700 Subject: [PATCH 036/572] Pass OptionTypeInfo maps by const& (#5295) Summary: In options_helper.cc various functions take a const unordered_map of string -> TypeInfo for options handling. These functions pass the (const) maps by value, resulting in unnecessary copies. Change them to pass by reference. This results in a noticeable reduction in the amount of time spent parsing options - in my case a set of unit tests using RocksDB which call SetOptions() to modify options sees a ~25% runtime reduction. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5295 Differential Revision: D15296334 Pulled By: riversand963 fbshipit-source-id: 4d4be3db635264943607911b296dda27fd7ce1a7 --- options/options_helper.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/options/options_helper.cc b/options/options_helper.cc index c33c2be6fb7..dbee1636d9f 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -255,7 +255,7 @@ const std::string kNameMergeOperator = "merge_operator"; template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter); namespace { @@ -350,7 +350,7 @@ bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, template bool SerializeStruct( const T& options, std::string* value, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { std::string opt_str; Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); if (!s.ok()) { @@ -363,7 +363,7 @@ bool SerializeStruct( template bool ParseSingleStructOption( const std::string& opt_val_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { size_t end = opt_val_str.find('='); std::string key = opt_val_str.substr(0, end); std::string value = opt_val_str.substr(end + 1); @@ -380,7 +380,7 @@ bool ParseSingleStructOption( template bool ParseStructOptions( const std::string& opt_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { assert(!opt_str.empty()); size_t start = 0; @@ -1092,7 +1092,7 @@ Status ParseColumnFamilyOption(const std::string& name, template bool SerializeSingleStructOption( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& name, const std::string& delimiter) { auto iter = type_info.find(name); if (iter == type_info.end()) { @@ -1112,7 +1112,7 @@ bool SerializeSingleStructOption( template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter) { assert(opt_string); opt_string->clear(); From 1583cb402eb6f52adac0261cb3766b47aac3078e Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 15 May 2019 15:13:44 -0700 Subject: [PATCH 037/572] Fix a flaky test with 
test sync point (#5310) Summary: If DB is opened with `avoid_unnecessary_blocking_io` being true, then `~ColumnFamilyHandleImpl` enqueues a purge request and schedules a background thread to perform the deletion. Without test sync point, whether the SST file is purged or not at a later point in time is not deterministic. If the SST does not exist, it will cause an assertion failure. How to reproduce: ``` $git checkout 6492430eaf1a13730eec81321528558cbf486c96 $make -j20 deletefile_test $gtest-parallel --repeat 1000 --worker 16 ./deletefile_test --gtest_filter=DeleteFileTest.BackgroundPurgeCFDropTest ``` The test may fail a few times. With changes made in this PR, repeat the above commands, and the test should not fail. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5310 Differential Revision: D15361136 Pulled By: riversand963 fbshipit-source-id: c4308d5f8da83472c893bf7f8ceed347fbfa850f --- db/deletefile_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 3ae464c5842..54bab847927 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -305,6 +305,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { &sleeping_task_after, Env::Priority::HIGH); // If background purge is enabled, the file should still be there. CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1"); // Execute background purges. sleeping_task_after.WakeUp(); @@ -318,6 +319,13 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { do_test(false); } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::BackgroundPurgeCFDropTest:1", + "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + options_.avoid_unnecessary_blocking_io = true; ASSERT_OK(ReopenDB(false)); { @@ -326,6 +334,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { } CloseDB(); + SyncPoint::GetInstance()->DisableProcessing(); } // This test is to reproduce a bug that read invalid ReadOption in iterator From 29a198564d097411ca4bf08ae061c35e91a22502 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 15 May 2019 15:57:04 -0700 Subject: [PATCH 038/572] Fixes for build_detect_platform Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5255 Differential Revision: D15246532 Pulled By: riversand963 fbshipit-source-id: 96a21509666152788fa2f956e865a6bed7c8f474 --- build_tools/build_detect_platform | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 057f77ec531..7f454bcca08 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -397,6 +397,7 @@ EOF #include int main() { size_t res = malloc_usable_size(0); + (void)res; return 0; } EOF @@ -411,6 +412,7 @@ EOF #include int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; + (void)x; return 0; } EOF @@ -422,7 +424,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> + #include int main() { void* frames[1]; backtrace_symbols(frames, backtrace(frames, 1)); @@ -480,6 +482,7 @@ EOF #include int main() { int cpuid = sched_getcpu(); + (void)cpuid; } EOF if [ "$?" 
= 0 ]; then @@ -515,7 +518,7 @@ fi if test "$USE_HDFS"; then if test -z "$JAVA_HOME"; then - echo "JAVA_HOME has to be set for HDFS usage." + echo "JAVA_HOME has to be set for HDFS usage." >&2 exit 1 fi HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS -I$HADOOP_HOME/include" @@ -553,12 +556,13 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < int main() { volatile uint32_t x = _mm_crc32_u32(0, 0); + (void)x; } EOF if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42" elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" + echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 exit 1 fi @@ -570,12 +574,13 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <&2 exit 1 fi @@ -589,6 +594,7 @@ if [ "$PLATFORM" != IOS ]; then #endif int main() { static __thread int tls; + (void)tls; } EOF if [ "$?" = 0 ]; then From f82e693a31d07ab8b391888ff60eb7ff5b95bd13 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 16 May 2019 15:20:19 -0700 Subject: [PATCH 039/572] RangeDelAggregator::StripeRep::Invalidate() to be skipped if empty (#5312) Summary: RangeDelAggregator::StripeRep::Invalidate() clears up several vectors. If we know there isn't anything there, we can save this small amount of CPU. Profiling shows that it sometimes takes a non-negligible amount of CPU. Worth a small optimization. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5312 Differential Revision: D15380511 Pulled By: siying fbshipit-source-id: 53c5f34c33b4cb1e743643c6086ac56d0b84ec2e --- db/range_del_aggregator.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index e593807d548..ce7897a975a 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -320,8 +320,10 @@ class RangeDelAggregator { RangeDelPositioningMode mode); void Invalidate() { - InvalidateForwardIter(); - InvalidateReverseIter(); + if (!IsEmpty()) { + InvalidateForwardIter(); + InvalidateReverseIter(); + } } bool IsRangeOverlapped(const Slice& start, const Slice& end); From c71f5bb9aa7fd2f12533a5b8300949e7f766e213 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Thu, 16 May 2019 15:35:28 -0700 Subject: [PATCH 040/572] Disable WriteUnPrepared stress tests (#5315) Summary: They are kind of flaky at the moment. Will re-enable them when the flakiness is fixed. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5315 Differential Revision: D15382744 Pulled By: maysamyabandeh fbshipit-source-id: 8b2f9d81a4bb34bfd51481727a682d5cd063c5e3 --- utilities/transactions/transaction_test.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6c71b679d60..6ea1fc70213 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -79,11 +79,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), - std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), - std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), - std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true))); + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { From a13026fb2fa45a1cc0f03f5e426035088f394c0a Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Thu, 16 May 2019 20:18:33 -0700 Subject: [PATCH 041/572] Added trace replay fast forward function (#5273) Summary: In the current db_bench trace replay, the replay process strictly follows the timestamps to issue the queries. In some cases, the user does not care about the time. Therefore, fast forward is needed for users to speed up the replay process. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5273 Differential Revision: D15389232 Pulled By: zhichao-cao fbshipit-source-id: 735d629b9d2a167b05af3e4fa0ddf9d5d0be1806 --- tools/db_bench_tool.cc | 5 +++++ util/trace_replay.cc | 15 ++++++++++++++- util/trace_replay.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b806fff8980..18d8733439b 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -762,6 +762,9 @@ DEFINE_bool(use_stderr_info_logger, false, DEFINE_string(trace_file, "", "Trace workload to a file. "); +DEFINE_int32(trace_replay_fast_forward, 1, + "Fast forward trace replay, must >= 1. 
"); + static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -6163,6 +6166,8 @@ void VerifyDBFromDB(std::string& truth_db_name) { } Replayer replayer(db_with_cfh->db, db_with_cfh->cfh, std::move(trace_reader)); + replayer.SetFastForward( + static_cast(FLAGS_trace_replay_fast_forward)); s = replayer.Replay(); if (s.ok()) { fprintf(stdout, "Replay started from trace_file: %s\n", diff --git a/util/trace_replay.cc b/util/trace_replay.cc index 28160b29292..c90fef2eff8 100644 --- a/util/trace_replay.cc +++ b/util/trace_replay.cc @@ -155,10 +155,22 @@ Replayer::Replayer(DB* db, const std::vector& handles, for (ColumnFamilyHandle* cfh : handles) { cf_map_[cfh->GetID()] = cfh; } + fast_forward_ = 1; } Replayer::~Replayer() { trace_reader_.reset(); } +Status Replayer::SetFastForward(uint32_t fast_forward) { + Status s; + if (fast_forward < 1) { + s = Status::InvalidArgument("Wrong fast forward speed!"); + } else { + fast_forward_ = fast_forward; + s = Status::OK(); + } + return s; +} + Status Replayer::Replay() { Status s; Trace header; @@ -182,7 +194,8 @@ Status Replayer::Replay() { } std::this_thread::sleep_until( - replay_epoch + std::chrono::microseconds(trace.ts - header.ts)); + replay_epoch + + std::chrono::microseconds((trace.ts - header.ts) / fast_forward_)); if (trace.type == kTraceWrite) { WriteBatch batch(trace.payload); db_->Write(woptions, &batch); diff --git a/util/trace_replay.h b/util/trace_replay.h index 749ea2f6432..29c00c287b2 100644 --- a/util/trace_replay.h +++ b/util/trace_replay.h @@ -88,6 +88,7 @@ class Replayer { ~Replayer(); Status Replay(); + Status SetFastForward(uint32_t fast_forward); private: Status ReadHeader(Trace* header); @@ -97,6 +98,7 @@ class Replayer { DBImpl* db_; std::unique_ptr trace_reader_; std::unordered_map cf_map_; + uint32_t fast_forward_; }; } // namespace rocksdb From f3a7847598d89ef8f9f531b10fabb7ce044a38f8 Mon Sep 17 00:00:00 2001 From: yiwu-arbug Date: Fri, 17 May 2019 10:23:38 -0700 Subject: [PATCH 042/572] Reduce iterator key comparison for upper/lower bound check (#5111) Summary: Previously if iterator upper/lower bound presents, `DBIter` will check the bound for every key. This patch turns the check into per-file or per-data block check when applicable, by checking against either file largest/smallest key or block index key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5111 Differential Revision: D15330061 Pulled By: siying fbshipit-source-id: 8a653fe3cd50d94d81eb2d13b087326c58ee2024 --- HISTORY.md | 1 + db/db_iter.cc | 4 ++-- db/version_set.cc | 40 +++++++++++++++++++++++-------- table/block_based_table_reader.cc | 20 +++++++++------- table/block_based_table_reader.h | 9 ++++++- table/internal_iterator.h | 25 +++++++++++++++++-- table/iterator_wrapper.h | 22 +++++++++++++---- table/merging_iterator.cc | 24 +++++++++++++++++++ 8 files changed, 117 insertions(+), 28 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9cf8a88da04..d45e94bb670 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +* Reduce iterator key comparision for upper/lower bound check. 
### Bug Fixes diff --git a/db/db_iter.cc b/db/db_iter.cc index 1d8ccf9adbd..a606e3acd66 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,7 +467,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - if (iterate_upper_bound_ != nullptr && + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -859,7 +859,7 @@ void DBIter::PrevInternal() { return; } - if (iterate_lower_bound_ != nullptr && + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/version_set.cc b/db/version_set.cc index f0dfe765871..03c5902728c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -954,12 +969,16 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + may_be_out_of_lower_bound_ = + read_options_.iterate_lower_bound != nullptr && + user_comparator_.Compare(ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, + for_compaction_, nullptr /* arena */, skip_filters_, level_, + smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -975,6 +994,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; bool for_compaction_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1043,11 
+1063,12 @@ void LevelIterator::SeekToLast() { void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4278,10 +4299,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 576117f0d35..34e40979247 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2446,11 +2446,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -2531,6 +2532,11 @@ void BlockBasedTableIterator::InitDataBlock() { key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; + if (read_options_.iterate_upper_bound != nullptr) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } } } @@ -2543,13 +2549,9 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - } + bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 385e50ab79f..8274f0cf965 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -588,7 +588,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && block_iter_points_to_real_block_ && @@ -619,6 +619,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. 
bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -680,6 +685,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { TBlockIter block_iter_; bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; + // Whether the current data block is fully within the iterate upper bound. + bool data_block_within_upper_bound_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd68e..1f57399c7f7 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and returns the result. Iterator + // implementations should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -94,6 +108,13 @@ class InternalIteratorBase : public Cleanable { // upper bound virtual bool IsOutOfBound() { return false; } + // Keys returned from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys returned from this iterator can be larger than or equal to + // iterate_upper_bound. 
+ virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1e2..a5aa5c49eac 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); } Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index e5df6bdf6f0..244b5e82c3d 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator. As long as the current child iterator + // cannot be out of bound, we know the current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From fb4c6a31cece73f79a05135c2821d511cd76aeba Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 17 May 2019 19:16:51 -0700 Subject: [PATCH 043/572] Log replay integration for secondary instance (#5305) Summary: RocksDB secondary can replay both MANIFEST and WAL now. 
On the one hand, the memory usage by memtables will grow after replaying WAL for some time. On the other hand, replaying the MANIFEST can bring the database persistent data to a more recent point in time, giving us the opportunity to discard some memtables containing outdated data. This PR coordinates the MANIFEST and WAL replay, using the updates from MANIFEST replay to update the active memtable and immutable memtable list of each column family. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5305 Differential Revision: D15386512 Pulled By: riversand963 fbshipit-source-id: a3ea6fc415f8382d8cf624f52a71ebdcffa3e355 --- HISTORY.md | 1 + db/db_impl.h | 4 +- db/db_impl_secondary.cc | 119 +++++++++++++++++++++++++++--------- db/db_impl_secondary.h | 102 ++++++++++++++++++++++++++----- db/db_secondary_test.cc | 130 ++++++++++++++++++++++++++++++++++++++++ db/memtable_list.cc | 18 ++++++ db/memtable_list.h | 7 +++ 7 files changed, 336 insertions(+), 45 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d45e94bb670..f67a8210d24 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. +* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/db/db_impl.h b/db/db_impl.h index c4fae9a6ad5..08cb1949118 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1078,8 +1078,8 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order - virtual Status RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* next_sequence, bool read_only); + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 007910ea5b4..5dfa2d0c942 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -18,7 +18,6 @@ namespace rocksdb { #ifndef ROCKSDB_LITE - DBImplSecondary::DBImplSecondary(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { @@ -35,6 +34,7 @@ Status DBImplSecondary::Recover( bool /*error_if_data_exists_in_logs*/) { mutex_.AssertHeld(); + JobContext job_context(0); Status s; s = static_cast(versions_.get()) ->Recover(column_families, &manifest_reader_, &manifest_reporter_, @@ -59,11 +59,29 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - s = FindAndRecoverLogFiles(); + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); } // TODO: update options_file_number_ needed? 
+ job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); + } return s; } @@ -151,7 +169,10 @@ Status DBImplSecondary::MaybeInitLogReader( // REQUIRES: log_numbers are sorted in ascending order Status DBImplSecondary::RecoverLogFiles( const std::vector& log_numbers, SequenceNumber* next_sequence, - bool /*read_only*/) { + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); mutex_.AssertHeld(); Status status; for (auto log_number : log_numbers) { @@ -184,6 +205,39 @@ Status DBImplSecondary::RecoverLogFiles( continue; } WriteBatchInternal::SetContents(&batch, record); + std::vector column_family_ids; + status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); + if (status.ok()) { + SequenceNumber seq = versions_->LastSequence(); + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + auto curr_log_num = port::kMaxUint64; + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. 
+ if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq); + cfd->mem()->SetNextLogNumber(log_number); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + } // do not check sequence number because user may toggle disableWAL // between writes which breaks sequence number continuity guarantee @@ -194,12 +248,30 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), nullptr /* flush_scheduler */, - true, log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); - if (!status.ok()) { + if (status.ok()) { + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); + } + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); @@ -296,18 +368,6 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, return s; } -// find new WAL and apply them in order to the secondary instance -Status DBImplSecondary::FindAndRecoverLogFiles() { - Status s; - std::vector logs; - s = FindNewLogNumbers(&logs); - if (s.ok() && !logs.empty()) { - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } - return s; -} - Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { if (read_options.managed) { @@ -393,20 +453,25 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { Status s; // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } if (s.ok()) { - SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { - sv_context.NewSuperVersion(); + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); } - 
sv_context.Clean(); + job_context.Clean(); } - // list wal_dir to discover new WALs and apply new changes to the secondary - // instance - s = FindAndRecoverLogFiles(); return s; } diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 32dbae058b8..912708b1ec0 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -96,40 +96,40 @@ class DBImplSecondary : public DBImpl { Status Put(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactFiles; @@ -140,32 +140,32 @@ class DBImplSecondary : public DBImpl { const int /*output_level*/, const int /*output_path_id*/ = -1, std::vector* const /*output_file_names*/ = nullptr, CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status DisableFileDeletions() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Flush; Status Flush(const FlushOptions& /*options*/, ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in 
secondary mode."); } using DBImpl::SyncWAL; Status SyncWAL() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DB::IngestExternalFile; @@ -173,7 +173,7 @@ class DBImplSecondary : public DBImpl { ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } // Try to catch up with the primary by reading as much as possible from the @@ -185,6 +185,70 @@ class DBImplSecondary : public DBImpl { Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + protected: + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + private: friend class DB; @@ -194,19 +258,25 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; - Status FindAndRecoverLogFiles(); + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); Status FindNewLogNumbers(std::vector* logs); Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, - bool read_only) override; + std::unordered_set* cfds_changed, + JobContext* job_context); std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; - // cache log readers for each log number, used for continue WAL replay + // Cache log readers for each log number, used for continue WAL replay // after recovery std::map> log_readers_; + + // Current WAL number replayed for each column family. 
+ std::unordered_map cfd_to_current_log_; }; } // namespace rocksdb diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 60ea5ba8d5f..a4267c7d596 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -243,6 +243,11 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { @@ -519,6 +524,131 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); range_scan_db(); } + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + 
ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} #endif //! ROCKSDB_LITE } // namespace rocksdb diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 21b44b1798a..d81b1d4d224 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -638,4 +638,22 @@ Status InstallMemtableAtomicFlushResults( return s; } +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } +} + } // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h index b56ad4932c4..5df35660a4d 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -294,6 +294,13 @@ class MemTableList { } } + // Used only by DBImplSecondary during log replay. + // Remove memtables whose data were written before the WAL with log_number + // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are + // not freed, but put into a vector for future deref and reclamation. + void RemoveOldMemTables(uint64_t log_number, + autovector* to_delete); + private: friend Status InstallMemtableAtomicFlushResults( const autovector* imm_lists, From 5c0e304170dbb157f9faa612f0568f37ad506674 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 20 May 2019 07:46:15 -0700 Subject: [PATCH 044/572] WritePrepared: Clarify the need for two_write_queues in unordered_write (#5313) Summary: WritePrepared transactions, when configured with two_write_queues=true, offer higher throughput with the unordered_write feature without compromising the RocksDB guarantees. This is because they perform ordering among writes in a 2nd step that is not tied to memtable write speed. The 2nd step is naturally provided by 2PC when the commit phase does the ordering as well. Without 2PC, the 2nd step is only provided when we use two_write_queues=true, where WritePrepared, after performing the writes, uses the 2nd queue in a 2nd step to assign order to the writes. The patch clarifies the need for two_write_queues=true in HISTORY.md and in the inline comments of unordered_write. 
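For illustration only (not part of this patch's diff): a minimal sketch of the configuration this implies, assuming the public TransactionDB::Open API; the DB path and the error handling are placeholders.
```
// Sketch: unordered_write with WRITE_PREPARED requires two_write_queues=true,
// so that the 2nd (commit) queue can assign order to the otherwise unordered
// memtable writes.
#include <cassert>

#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;   // trade write ordering for throughput
  options.two_write_queues = true;  // required for WRITE_PREPARED below

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  rocksdb::TransactionDB* db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/txn_db_example", &db);
  // With two_write_queues=false, this combination now fails with
  // Status::NotSupported (see the check added below).
  assert(s.ok());
  delete db;
  return 0;
}
```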
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5313 Differential Revision: D15379977 Pulled By: maysamyabandeh fbshipit-source-id: 5b6f05b9b59285dcbf3b0532215ba9fe7d926e00 --- HISTORY.md | 2 +- db/db_impl_write.cc | 5 +++ include/rocksdb/options.h | 5 ++- .../pessimistic_transaction_db.cc | 6 +++ utilities/transactions/transaction_test.cc | 6 +-- .../write_prepared_transaction_test.cc | 38 +------------------ 6 files changed, 20 insertions(+), 42 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f67a8210d24..44fc66bcbd8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,7 +5,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. -* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. +* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. ### Performance Improvements diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index f4c72e298ee..92edc84254c 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -605,6 +605,11 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; if (pending_cnt == 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); switch_cv_.notify_all(); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index c8b4cc538d9..7d22fb67559 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -899,8 +899,9 @@ struct DBOptions { // ::MultiGet and Iterator's consistent-point-in-time view property. // If the application cannot tolerate the relaxed guarantees, it can implement // its own mechanisms to work around that and yet benefit from the higher - // throughput. Using TransactionDB with WRITE_PREPARED write policy is one way - // to achieve immutable snapshots despite unordered_write. + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. 
// // By default, i.e., when it is false, rocksdb does not advance the sequence // number for new snapshots unless all the writes with lower sequence numbers diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index ecf6d2ff387..c4e6e247756 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -232,6 +232,12 @@ Status TransactionDB::Open( return Status::NotSupported( "WRITE_UNPREPARED is currently incompatible with unordered_writes"); } + if (txn_db_options.write_policy == WRITE_PREPARED && + db_options.unordered_write && !db_options.two_write_queues) { + return Status::NotSupported( + "WRITE_UNPREPARED is incompatible with unordered_writes if " + "two_write_queues is not enabled."); + } std::vector column_families_copy = column_families; std::vector compaction_enabled_cf_indices; diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6ea1fc70213..3c8036614f0 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -47,7 +47,6 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); @@ -58,7 +57,6 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); @@ -79,7 +77,9 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true))); + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 6bad81db0ee..b93f1a74ffe 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -573,7 +573,6 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); #ifndef ROCKSDB_VALGRIND_RUN @@ -644,29 +643,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), std::make_tuple(false, false, 
WRITE_PREPARED, kOrderedWrite, 17, 20), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20), - - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 10, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 11, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 12, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 13, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 14, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 15, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 16, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 17, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 19, - 20))); + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20))); INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SeqAdvanceConcurrentTest, @@ -704,18 +681,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10), - std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10), - - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 10))); + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(WritePreparedTransactionTest, CommitMapTest) { From 931c9df88677bcb6935eb353e79085790b79c8d4 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli <1004951+vjnadimpalli@users.noreply.github.com> Date: Mon, 20 May 2019 10:37:37 -0700 Subject: [PATCH 045/572] Use separate status code for column family drop and db shutdown in progress (#5275) Summary: Currently RocksDB uses Status::ShutdownInProgress to inform about column family drop. I would like to have a separate Status code for this event. 
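For illustration, a hedged sketch of how a caller can then tell the two conditions apart (the handler function here is hypothetical; only Status::ColumnFamilyDropped and Status::IsColumnFamilyDropped() come from this patch):
```
#include "rocksdb/status.h"

// Hypothetical dispatcher for a background job result. Before this patch,
// both conditions surfaced as IsShutdownInProgress(); now they are distinct.
void HandleBackgroundJobStatus(const rocksdb::Status& s) {
  if (s.IsColumnFamilyDropped()) {
    // Only the column family is gone; the DB is still healthy, so the
    // background work for that column family can simply be abandoned.
  } else if (s.IsShutdownInProgress()) {
    // The whole DB is closing; stop scheduling further work.
  } else if (!s.ok()) {
    // A genuine error that should be surfaced to the caller.
  }
}
```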
https://github.com/facebook/rocksdb/blob/master/include/rocksdb/status.h#L55 Comment on this: https://github.com/facebook/rocksdb/blob/abc4202e47eb433dc731911af38f232d2148428c/db/version_set.cc#L2742:L2743 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5275 Differential Revision: D15204583 Pulled By: vjnadimpalli fbshipit-source-id: 95e99e34b27bc165b554ecb8a48a7f8e60f21e2a --- HISTORY.md | 4 ++++ db/compaction_job.cc | 11 ++++++---- db/db_compaction_test.cc | 12 ++++++++--- db/db_flush_test.cc | 2 +- db/db_impl_compaction_flush.cc | 37 ++++++++++++++++++---------------- db/flush_job.cc | 10 +++++---- db/memtable_list.cc | 2 +- db/version_set.cc | 4 +--- include/rocksdb/status.h | 16 ++++++++++++++- java/rocksjni/portal.h | 8 ++++++++ util/status.cc | 3 +++ 11 files changed, 75 insertions(+), 34 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 44fc66bcbd8..e9f06b53280 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,8 +14,12 @@ * Merging iterator to avoid child iterator reseek for some cases * Reduce iterator key comparison for upper/lower bound check. +### General Improvements +* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. + ### Bug Fixes + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index fb77431fddc..d1ae1932729 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1004,10 +1004,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); - if (status.ok() && - (shutting_down_->load(std::memory_order_relaxed) || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); } if (status.ok()) { status = input->status(); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b5033b66f0c..91a04205e07 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -3890,11 +3890,15 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { } Flush(1); } - auto manual_compaction_thread = port::Thread([this]() { + auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) - .IsShutdownInProgress()); + Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); + if (i == 0) { + ASSERT_TRUE(s.IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(s.IsShutdownInProgress()); + } }); TEST_SYNC_POINT( diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index c603f60b460..876605b2e48 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -431,7 +431,7 @@ TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { cf_ids.push_back(cf_id); }
ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped()); Destroy(options); } diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 900ea4acdcd..38c69dfc1e4 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -201,7 +201,7 @@ Status DBImpl::FlushMemTableToOutputFile( cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } @@ -254,7 +254,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( snapshot_checker, log_buffer, thread_pri); if (!s.ok()) { status = s; - if (!s.IsShutdownInProgress()) { + if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { // At this point, DB is not shutting down, nor is cfd dropped. // Something is wrong, thus we break out of the loop. break; @@ -385,7 +385,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( for (const auto& e : exec_status) { if (!e.second.ok()) { s = e.second; - if (!e.second.IsShutdownInProgress()) { + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { // If a flush job did not return OK, and the CF is not dropped, and // the DB is not shutting down, then we have to return this result to // caller later. @@ -397,15 +398,11 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } - // If db is NOT shutting down, and one or more column families have been - // dropped. - // TODO: use separate status code for db shutdown and column family dropped. - if (s.IsShutdownInProgress() && - !shutting_down_.load(std::memory_order_acquire)) { + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsShutdownInProgress() || s.IsColumnFamilyDropped()) { // Sync on all distinct output directories. for (auto dir : distinct_output_dirs) { if (dir != nullptr) { @@ -523,7 +520,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsColumnFamilyDropped()) { // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. 
for (int i = 0; i != num_cfs; ++i) { @@ -1052,7 +1049,7 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, @@ -1697,7 +1694,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, cfd->GetName().c_str()); bg_cv_.Wait(); } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } @@ -2159,7 +2159,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason, thread_pri); - if (!s.ok() && !s.IsShutdownInProgress() && + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to @@ -2184,7 +2184,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { // If flush failed, we want to delete all temporary files that we might have // created. Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); // delete unnecessary files if any, this is done outside the mutex if (job_context.HaveSomethingToClean() || job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { @@ -2248,7 +2249,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, mutex_.Unlock(); env_->SleepForMicroseconds(10000); // prevent hot loop mutex_.Lock(); - } else if (!s.ok() && !s.IsShutdownInProgress()) { + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -2272,7 +2274,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // If compaction failed, we want to delete all temporary files that we might // have created (they might not be all recorded in job_context in case of a // failure). Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2710,7 +2713,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (status.ok() || status.IsCompactionTooLarge()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", diff --git a/db/flush_job.cc b/db/flush_job.cc index 4226589e79d..21c1ff3a746 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -229,10 +229,12 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, // This will release and re-acquire the mutex. 
Status s = WriteLevel0Table(); - if (s.ok() && - (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during flush"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); } if (!s.ok()) { diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d81b1d4d224..bdcbd218663 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -598,7 +598,7 @@ Status InstallMemtableAtomicFlushResults( imm->InstallNewVersion(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsColumnFamilyDropped()) { for (size_t i = 0; i != cfds.size(); ++i) { if (cfds[i]->IsDropped()) { continue; } diff --git a/db/version_set.cc b/db/version_set.cc index 03c5902728c..15b9d01feea 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3842,8 +3842,6 @@ Status VersionSet::LogAndApply( } } if (0 == num_undropped_cfds) { - // TODO (yanqin) maybe use a different status code to denote column family - // drop other than OK and ShutdownInProgress for (int i = 0; i != num_cfds; ++i) { manifest_writers_.pop_front(); } @@ -3851,7 +3849,7 @@ if (!manifest_writers_.empty()) { manifest_writers_.front()->cv.Signal(); } - return Status::ShutdownInProgress(); + return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 12e8070d1e8..ac97ce442af 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -58,7 +58,9 @@ class Status { kBusy = 11, kExpired = 12, kTryAgain = 13, - kCompactionTooLarge = 14 + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode }; Code code() const { return code_; } @@ -184,6 +186,15 @@ return Status(kCompactionTooLarge, msg, msg2); } + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); } @@ -256,6 +267,9 @@ // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition.
Stricto sensu, a NoSpace error is an I/O error diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 193804ac318..d1585fcfa80 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -467,6 +467,8 @@ class StatusJni : public RocksDBNativeClass { return 0xC; case rocksdb::Status::Code::kTryAgain: return 0xD; + case rocksdb::Status::Code::kColumnFamilyDropped: + return 0xE; default: return 0x7F; // undefined } @@ -584,6 +586,12 @@ class StatusJni : public RocksDBNativeClass { new rocksdb::Status(rocksdb::Status::TryAgain( rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); break; + case 0xE: + // ColumnFamilyDropped + status = std::unique_ptr<rocksdb::Status>( + new rocksdb::Status(rocksdb::Status::ColumnFamilyDropped( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; case 0x7F: default: return nullptr; diff --git a/util/status.cc b/util/status.cc index c66bf6f8e16..9405944808d 100644 --- a/util/status.cc +++ b/util/status.cc @@ -109,6 +109,9 @@ std::string Status::ToString() const { case kTryAgain: type = "Operation failed. Try again.: "; break; + case kColumnFamilyDropped: + type = "Column family dropped: "; + break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast<int>(code())); From cd43446d017fd3929e5883bccf1206afafd57952 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 20 May 2019 13:47:32 -0700 Subject: [PATCH 046/572] Improve DBTablePropertiesTest.GetPropertiesOfTablesInRange (#5302) Summary: DBTablePropertiesTest.GetPropertiesOfTablesInRange sometimes trips an assertion because the generated LSM tree doesn't have an L1 file. Tighten the compaction triggering condition even further, hoping it goes away. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5302 Differential Revision: D15325971 Pulled By: siying fbshipit-source-id: 3e032bdb16fe8d98d5fcfcd65dd8be9781f3d6ae --- db/db_table_properties_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 5a54fd81c05..77ea0020dd6 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -139,12 +139,12 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { Options options; options.create_if_missing = true; options.write_buffer_size = 4096; - options.max_write_buffer_number = 3; + options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; + options.level0_stop_writes_trigger = 2; options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_base = 40960; options.max_bytes_for_level_multiplier = 4; options.hard_pending_compaction_bytes_limit = 16 * 1024; options.num_levels = 8; From b2274da0e54da2a4c7faac571377edd8ece43cec Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 21 May 2019 12:17:15 -0700 Subject: [PATCH 047/572] LogWriter to only flush after finishing the whole record (#5328) Summary: Right now, in log writer, we call flush after writing each physical record. I don't see the necessity of it. Right now, the underlying writer has a buffer, so there isn't a concern that the write request is too large either. On the other hand, in an Env where every flush is expensive, the current approach is significantly slower than flushing only after the whole record finishes, especially when the record is very large.
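A small standalone model may make the change concrete; all names here are illustrative stand-ins, not RocksDB's actual classes. The record is still written as multiple physical fragments, but the flush now happens once per record rather than once per fragment.

```cpp
#include <cstddef>
#include <string>

// Toy stand-in for the buffered destination file writer.
class BufferedDest {
 public:
  void Append(const std::string& fragment) { buffer_ += fragment; }
  void Flush() { ++flush_count_; }  // the expensive call in some Envs
  int flush_count() const { return flush_count_; }

 private:
  std::string buffer_;
  int flush_count_ = 0;
};

// Model of the new AddRecord flow: append every fragment, flush once.
void AddRecord(BufferedDest* dest, const std::string& record,
               std::size_t block_size) {
  for (std::size_t off = 0; off < record.size(); off += block_size) {
    dest->Append(record.substr(off, block_size));  // one physical fragment
  }
  dest->Flush();  // previously this ran inside the loop, once per fragment
}
```

For example, with a 1 MB record and 32 KB blocks, the old flow would flush 32 times where the new flow flushes once.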
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5328 Differential Revision: D15425032 Pulled By: siying fbshipit-source-id: 440ebef002dfbb60c59d8388c9ddfc83d79700aa --- HISTORY.md | 1 + db/log_writer.cc | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e9f06b53280..b65f5a038b1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases * Reduce iterator key comparison for upper/lower bound check. +* Log Writer will flush after finishing the whole record, rather than a fragment. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. diff --git a/db/log_writer.cc b/db/log_writer.cc index 6ee39198184..c46965e16e0 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -102,6 +102,13 @@ Status Writer::AddRecord(const Slice& slice) { left -= fragment_length; begin = false; } while (s.ok() && left > 0); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + } + return s; } @@ -146,11 +153,6 @@ Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - if (!manual_flush_) { - s = dest_->Flush(); - } - } } block_offset_ += header_size + n; return s; From dda474399affd9042c237ed3ab47a5a3e8a83c92 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 21 May 2019 16:19:39 -0700 Subject: [PATCH 048/572] Remove PATENTS text from a few straggler files (#5326) Summary: Remove PATENTS related wording from a few stragglers which still reference the old PATENTS file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5326 Differential Revision: D15423297 Pulled By: sagar0 fbshipit-source-id: 4babcddfc120b7d2fed6eb3898287cf8012bf8ea --- port/win/win_jemalloc.cc | 6 +++--- util/crc32c_ppc.c | 8 +++----- util/crc32c_ppc.h | 8 +++----- util/crc32c_ppc_asm.S | 8 +++----- util/crc32c_ppc_constants.h | 8 +++----- util/ppc-opcode.h | 8 +++----- 6 files changed, 18 insertions(+), 28 deletions(-) diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc index 3268a56affd..b2077938806 100644 --- a/port/win/win_jemalloc.cc +++ b/port/win/win_jemalloc.cc @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index d9467d28262..654d606aaad 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #define CRC_TABLE #include diff --git a/util/crc32c_ppc.h b/util/crc32c_ppc.h index 64a81a43102..c359061c610 100644 --- a/util/crc32c_ppc.h +++ b/util/crc32c_ppc.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once diff --git a/util/crc32c_ppc_asm.S b/util/crc32c_ppc_asm.S index 5142a8f259b..a317bf96b87 100644 --- a/util/crc32c_ppc_asm.S +++ b/util/crc32c_ppc_asm.S @@ -2,11 +2,9 @@ // Copyright (c) 2015 Anton Blanchard , IBM // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #include #include "ppc-opcode.h" diff --git a/util/crc32c_ppc_constants.h b/util/crc32c_ppc_constants.h index 21ec6fd9458..f6494cd01c3 100644 --- a/util/crc32c_ppc_constants.h +++ b/util/crc32c_ppc_constants.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (C) 2015, 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once diff --git a/util/ppc-opcode.h b/util/ppc-opcode.h index e632ef26a3c..5cc5af0e30c 100644 --- a/util/ppc-opcode.h +++ b/util/ppc-opcode.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once From 518cd1a62aeaaa9584516fdcf81bbfafbd75f18c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 22 May 2019 09:17:39 -0700 Subject: [PATCH 049/572] Use GetCurrentManifestPath to locate current MANIFEST file (#5331) Summary: In version_set.cc, there is a function GetCurrentManifestPath. The goal of this task is to refactor ListColumnFamilies function so that ListColumnFamilies calls GetCurrentManifestPath to search for MANIFEST. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5331 Differential Revision: D15444524 Pulled By: HaoyuHuang fbshipit-source-id: 1dcbd030bc0f2e835695741f450bba150f2f2903 --- db/version_set.cc | 37 ++++++++++++++++++++----------------- db/version_set.h | 4 +++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 15b9d01feea..5723c6d9253 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4041,10 +4041,15 @@ Status VersionSet::ExtractInfoFromVersionEdit( return Status::OK(); } -Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(env != nullptr); assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + std::string fname; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname); + Status s = ReadFileToString(env, CurrentFileName(dbname), &fname); if (!s.ok()) { return s; } @@ -4054,12 +4059,12 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { // remove the trailing '\n' fname.resize(fname.size() - 1); FileType type; - bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type); + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); if (!parse_ok || type != kDescriptorFile) { return Status::Corruption("CURRENT file corrupted"); } - *manifest_path = dbname_; - if (dbname_.back() != '/') { + *manifest_path = dbname; + if (dbname.back() != '/') { manifest_path->push_back('/'); } *manifest_path += fname; @@ -4080,7 +4085,8 @@ Status VersionSet::Recover( // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(&manifest_path); + Status s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); if (!s.ok()) { return s; } @@ -4321,26 +4327,22 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, // so we're fine using the defaults EnvOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env, CurrentFileName(dbname), ¤t); + std::string manifest_path; + uint64_t manifest_file_number; + Status s = GetCurrentManifestPath(dbname, env, &manifest_path, + &manifest_file_number); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { - 
return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname + "/" + current; std::unique_ptr file_reader; { std::unique_ptr file; - s = env->NewSequentialFile(dscname, &file, soptions); + s = env->NewSequentialFile(manifest_path, &file, soptions); if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); } std::map column_family_names; @@ -5510,7 +5512,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( Status s; do { std::string manifest_path; - s = GetCurrentManifestPath(&manifest_path); + s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); std::unique_ptr manifest_file; if (s.ok()) { if (nullptr == manifest_reader->get() || diff --git a/db/version_set.h b/db/version_set.h index d82c5b47291..28ad0c2c234 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -807,7 +807,9 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status GetCurrentManifestPath(std::string* manifest_filename); + static Status GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_filename, + uint64_t* manifest_file_number); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families From 3d9d77d9006c246ea54656440fb29eebfa048f8b Mon Sep 17 00:00:00 2001 From: Thomas Fersch Date: Wed, 22 May 2019 23:38:09 -0700 Subject: [PATCH 050/572] Restrict L0->L0 compaction according to max_compaction_bytes option (#5329) Summary: Modified FindIntraL0Compaction to stop picking more files if total amount of compensated bytes would be larger than max_compaction_bytes option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5329 Differential Revision: D15435728 Pulled By: ThomasFersch fbshipit-source-id: d118a6da88d5df8ee20944422ade37cf6b15d60c --- db/compaction_picker.cc | 16 +++++++--- db/compaction_picker.h | 17 +++++++++++ db/compaction_picker_fifo.cc | 3 +- db/compaction_picker_test.cc | 59 ++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index d6d7b69876e..4bd8ff0e33a 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -42,19 +42,23 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, CompactionInputFiles* comp_inputs) { size_t compact_bytes = static_cast(level_files[0]->fd.file_size); + uint64_t compensated_compact_bytes = level_files[0]->compensated_file_size; size_t compact_bytes_per_del_file = port::kMaxSizet; - // compaction range will be [0, span_len). + // Compaction range will be [0, span_len). size_t span_len; - // pull in files until the amount of compaction work per deleted file begins - // increasing. + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached. 
size_t new_compact_bytes_per_del_file = 0; for (span_len = 1; span_len < level_files.size(); ++span_len) { compact_bytes += static_cast(level_files[span_len]->fd.file_size); + compensated_compact_bytes += level_files[span_len]->compensated_file_size; new_compact_bytes_per_del_file = compact_bytes / span_len; if (level_files[span_len]->being_compacted || - new_compact_bytes_per_del_file > compact_bytes_per_del_file) { + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compensated_compact_bytes > max_compaction_bytes) { break; } compact_bytes_per_del_file = new_compact_bytes_per_del_file; @@ -1627,7 +1631,9 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { return false; } return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, &start_level_inputs_); + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_); } } // namespace diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 01f5495e67b..250566b1065 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -273,9 +273,26 @@ class NullCompactionPicker : public CompactionPicker { }; #endif // !ROCKSDB_LITE +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, CompactionInputFiles* comp_inputs); CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, diff --git a/db/compaction_picker_fifo.cc b/db/compaction_picker_fifo.cc index 1322989e568..eadb31f9ee5 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction_picker_fifo.cc @@ -134,7 +134,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( mutable_cf_options .level0_file_num_compaction_trigger /* min_files_to_compact */ , - max_compact_bytes_per_del_file, &comp_inputs)) { + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 31325c12893..c759dae8b6c 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -1478,6 +1478,65 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. 
The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U); + Add(0, 2U, "151", "200", 200000U); + Add(0, 3U, "201", "250", 200000U); + Add(0, 4U, "251", "300", 200000U); + Add(0, 5U, "301", "350", 200000U); + Add(1, 6U, "100", "350", 200000U); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0U, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U); + Add(0, 2U, "151", "200", 200000U); + Add(0, 3U, "201", "250", 200000U); + Add(0, 4U, "251", "300", 200000U); + Add(0, 5U, "301", "350", 200000U); + Add(1, 6U, "100", "350", 200000U); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0U, compaction->output_level()); +} + } // namespace rocksdb int main(int argc, char** argv) { From 2095ae88585f8ee1ef24b07231f536ba583dd345 Mon Sep 17 00:00:00 2001 From: Silver Chan Date: Thu, 23 May 2019 14:00:20 -0700 Subject: [PATCH 051/572] fixed db_stress.cc build error (#5307) Summary: when building this file using Xcode 10.2.1 on macOS 10.14, the compiler reports this error: ` rocksdb/tools/db_stress.cc:3613:33: error: implicit instantiation of undefined template 'std::__1::array<std::string, 10>' std::array<std::string, 10> keys = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; /usr/include/c++/v1/__tuple:223:64: note: template is declared here template <class _Tp, size_t _Size> struct _LIBCPP_TEMPLATE_VIS array; ^ 1 error generated. ` Including <array> fixes this error.
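The root cause is that libc++ only forward-declares `std::array` (in `<__tuple>`), so any translation unit that instantiates it must include `<array>` itself. A minimal illustration, with the element type assumed to be `std::string` based on the initializer in the diagnostic above:

```cpp
#include <array>   // without this include, the declaration below fails on
                   // libc++ with "implicit instantiation of undefined template"
#include <string>

std::array<std::string, 10> keys = {"0", "1", "2", "3", "4",
                                    "5", "6", "7", "8", "9"};
```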
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5307 Differential Revision: D15475217 Pulled By: sagar0 fbshipit-source-id: b04a7658c2ca2573157028863b3a80f5ab52b9de --- tools/db_stress.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 6eb974e0934..579178efffc 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -38,6 +38,7 @@ int main() { #include #include #include +#include #include #include #include From 40aa520a51bbf5b8bae54861a7c9c433a1b40006 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 23 May 2019 14:19:12 -0700 Subject: [PATCH 052/572] Add class comment for BlockFetcher Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5337 Differential Revision: D15482289 Pulled By: ltamasi fbshipit-source-id: 8639ca78c1b8dfcc337a742d4d81d5752f12545f --- table/block_fetcher.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/table/block_fetcher.h b/table/block_fetcher.h index b5fee941597..0dcdfc76125 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -13,14 +13,28 @@ #include "util/memory_allocator.h" namespace rocksdb { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store either +// compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + class BlockFetcher { public: - // Read the block identified by "handle" from "file". - // The only relevant option is options.verify_checksums for now. - // On failure return non-OK. - // On success fill *result and return OK - caller owns *result - // @param uncompression_dict Data for presetting the compression library's - // dictionary. BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, From dc30a9b69bc2c9f38e7e3266cfeb7983d2712ca4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 15:53:37 -0700 Subject: [PATCH 053/572] Add comments to db/db_iter.h (#5340) Summary: Add file comment in db/db_iter.h and minor changes in other parts. 
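Since the header comment added below describes DBIter's snapshot semantics, a short sketch of the public API those semantics serve may help; `db` is assumed to be an open `rocksdb::DB*`, and the function name is illustrative.

```cpp
#include <cassert>
#include "rocksdb/db.h"

// Iterate over the user-key view pinned at a snapshot: for each user key,
// the iterator exposes the newest value whose sequence number is <= the
// snapshot's, with tombstones and older versions resolved internally.
void IterateAtSnapshot(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  rocksdb::ReadOptions ropts;
  ropts.snapshot = snap;  // writes after this point are invisible here
  rocksdb::Iterator* it = db->NewIterator(ropts);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // it->key() / it->value() give the snapshot-consistent user-key view.
  }
  assert(it->status().ok());
  delete it;
  db->ReleaseSnapshot(snap);
}
```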
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5340 Differential Revision: D15484605 Pulled By: siying fbshipit-source-id: 173771f9d5bd51303de5410ee5afd0a4af9d6572 --- db/db_iter.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/db/db_iter.h b/db/db_iter.h index a640f0296e5..8d8af3fd292 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -21,11 +21,36 @@ namespace rocksdb { +// This file declares the factory functions of DBIter, in its original form +// or a wrapped form with class ArenaWrappedDBIter, which is defined here. +// Class DBIter, which is declared and implemented inside db_iter.cc, is +// an iterator that converts internal keys (yielded by an InternalIterator) +// that were live at the specified sequence number into appropriate user +// keys. +// Each internal key consists of a user key, a sequence number, and a value +// type. DBIter deals with multiple key versions, tombstones, merge operands, +// etc, and exposes an Iterator. +// For example, DBIter may wrap the following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// class Arena; class DBIter; // Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number +// "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, const ReadOptions& read_options, @@ -41,6 +66,8 @@ extern Iterator* NewDBIterator( // an iterator hierarchy whose memory can be allocated inline. In that way, // accessing the iterator tree can be more cache friendly. It is also faster // to allocate. +// When using the class's Iterator interface, the behavior is exactly +// the same as the inner DBIter. class ArenaWrappedDBIter : public Iterator { public: virtual ~ArenaWrappedDBIter(); From 02830a20f8673de7b332a42e4cb376f79de0b121 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 16:16:38 -0700 Subject: [PATCH 054/572] Add comments in db/dbformat.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5342 Differential Revision: D15485238 Pulled By: siying fbshipit-source-id: a56b374584cb1d815c1173907a807d90b37d4dd6 --- db/dbformat.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/db/dbformat.h b/db/dbformat.h index c850adcb01a..437119fb775 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -28,6 +28,12 @@ namespace rocksdb { +// The file declares data structures and functions that deal with internal +// keys. +// Each internal key contains a user key, a sequence number (SequenceNumber) +// and a type (ValueType), and they are usually encoded together. +// There are some related helper classes here. + class InternalKey; // Value types encoded as the last component of internal keys.
@@ -88,6 +94,8 @@ static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1); static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; +// The data structure that represents an internal key in the way that user_key, +// sequence number and type are stored in separate forms. struct ParsedInternalKey { Slice user_key; SequenceNumber sequence; @@ -192,9 +200,7 @@ class InternalKeyComparator } }; -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. +// The class represents the internal key in encoded form. class InternalKey { private: std::string rep_; @@ -295,6 +301,12 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { return num >> 8; } +// The class to store keys in an efficient way. It allows: +// 1. Users can either copy the key into it, or have it point to an unowned +// address. +// 2. For a copied key, a short inline buffer is kept to reduce memory +// allocation for smaller keys. +// 3. It tracks user key or internal key, and allows conversion between them. class IterKey { public: IterKey() @@ -506,6 +518,8 @@ class IterKey { void operator=(const IterKey&) = delete; }; +// Convert from a SliceTransform of user keys to a SliceTransform of +// internal keys. class InternalKeySliceTransform : public SliceTransform { public: explicit InternalKeySliceTransform(const SliceTransform* transform) @@ -631,6 +645,7 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, return r; } +// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. struct ParsedInternalKeyComparator { explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) : cmp(c) {} From 38a06aa2254ed363762c9f735df3638eb22b73b2 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 16:22:13 -0700 Subject: [PATCH 055/572] Improve comments of classes for PlainTable (#5339) Summary: Simply add some comments. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5339 Differential Revision: D15485315 Pulled By: siying fbshipit-source-id: 4594b1c4c967e6bd08aa7fa08a37df3481df1938 --- table/plain_table_builder.h | 3 +++ table/plain_table_factory.h | 14 +++++++++++++- table/plain_table_index.h | 26 ++++++++++++++++++++++++-- table/plain_table_key_coding.h | 15 ++++++++++++--- table/plain_table_reader.h | 13 +++++++------ 5 files changed, 59 insertions(+), 12 deletions(-) diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index ca0879a4e1d..9a5b44b9c2c 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -24,6 +24,9 @@ class BlockHandle; class WritableFile; class TableBuilder; +// The builder class of PlainTable. For a description of the PlainTable format, +// see comments of class PlainTableFactory, where instances of +// PlainTableBuilder are created. class PlainTableBuilder: public TableBuilder { public: // Create a builder that will store the contents of the table it is diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index dade1566096..1bd155f93e9 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -24,7 +24,19 @@ class WritableFile; class Table; class TableBuilder; -// IndexedTable requires fixed length key, configured as a constructor +// PlainTableFactory is the entry point to the PlainTable format of +// SST files.
It returns instances of PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of +// the following downsides: +// 1. Data compression is not supported. +// 2. Data is not checksummed. +// it is not recommended to use this format on other types of file systems. +// +// PlainTable requires a fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ // | version | user_key_length | diff --git a/table/plain_table_index.h b/table/plain_table_index.h index 360d998279a..1457fd00d81 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -20,6 +20,12 @@ namespace rocksdb { +// The file contains two classes, PlainTableIndex and PlainTableIndexBuilder. +// The two classes implement the index format of PlainTable. +// For a description of the PlainTable format, see comments of class +// PlainTableFactory. +// +// // PlainTableIndex contains index_size_ buckets, each of which is a // 32-bit integer. The lower 31 bits contain an offset value (explained below) // and the first bit of the integer indicates type of the offset. @@ -55,6 +61,10 @@ namespace rocksdb { // .... // record N file offset: fixedint32 // + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by PlainTableReader class. class PlainTableIndex { public: enum IndexSearchResult { @@ -72,11 +82,22 @@ class PlainTableIndex { index_(nullptr), sub_index_(nullptr) {} + // The function that executes the lookup in the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. IndexSearchResult GetOffset(uint32_t prefix_hash, uint32_t* bucket_value) const; - Status InitFromRawData(Slice data); + // Initialize data from `index_data`, which points to raw data for + // the index stored in the SST file. + Status InitFromRawData(Slice index_data); + // Decode the sub-index for a specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, uint32_t* upper_bound) const { const char* index_ptr = &sub_index_[offset]; @@ -106,9 +127,10 @@ class PlainTableIndex { // After calling Finish(), it returns Slice, which is usually // used either to initialize PlainTableIndex or // to save index to sst file. -// For more details about the index, please refer to: +// For more details about the index, please refer to: // https://github.com/facebook/rocksdb/wiki/PlainTable-Format // #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class.
class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 9a27ad06b78..93f8f7af4b5 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -11,6 +11,11 @@ #include "db/dbformat.h" #include "table/plain_table_reader.h" +// The file contains three helper classes of PlainTable format, +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes implement the lowest-level operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory. namespace rocksdb { class WritableFile; struct ParsedInternalKey; struct PlainTableReaderFileInfo; enum PlainTableEntryType : unsigned char; -// Helper class to write out a key to an output file -// Actual data format of the key is documented in plain_table_factory.h +// Helper class for PlainTable format to write out a key to an output file +// The class is used in PlainTableBuilder. class PlainTableKeyEncoder { public: explicit PlainTableKeyEncoder(EncodingType encoding_type, @@ -53,6 +58,10 @@ class PlainTableKeyEncoder { IterKey pre_prefix_; }; +// The class does raw file reads for PlainTableReader. +// It hides whether it is an mmap read or a non-mmap read. +// The class is implemented in a way to favor the performance of the mmap case. +// The class is used by PlainTableReader. class PlainTableFileReader { public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) @@ -122,7 +131,7 @@ class PlainTableFileReader { }; // A helper class to decode keys from input buffer -// Actual data format of the key is documented in plain_table_factory.h +// The class is used by PlainTableReader. class PlainTableKeyDecoder { public: explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 14760f20a57..12b22aaf12e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -56,16 +56,17 @@ struct PlainTableReaderFileInfo { file(std::move(_file)) {} }; +// The reader class of PlainTable. For a description of the PlainTable format, +// see comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader: public TableReader { + public: // Based on the output file format shown in plain_table_factory.h: -// When opening the output file, IndexedTableReader creates a hash table -// from key prefixes to offset of the output file. IndexedTable will decide +// When opening the output file, PlainTableReader creates a hash table +// from key prefixes to offset of the output file. PlainTable will decide // whether it points to the data offset of the first key with the key prefix // or the offset of it. If too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk.
-// -// The implementation of IndexedTableReader requires output file is mmaped -class PlainTableReader: public TableReader { - public: static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, From 09b534cc2f36dc9e9ab13d1067fa8209456e9771 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 23 May 2019 16:26:07 -0700 Subject: [PATCH 056/572] improve comments for CompactionJob (#5341) Summary: add class/function level comments to the header file Pull Request resolved: https://github.com/facebook/rocksdb/pull/5341 Differential Revision: D15485442 Pulled By: miasantreble fbshipit-source-id: 9f11e2a1cd3ce0f4990f01353d0a6f4b050615cf --- db/compaction_job.cc | 6 ------ db/compaction_job.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index d1ae1932729..44fb385d1b3 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -415,7 +415,6 @@ void CompactionJob::Prepare() { write_hint_ = c->column_family_data()->CalculateSSTWriteHint(c->output_level()); - // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); if (c->ShouldFormSubcompactions()) { @@ -445,11 +444,6 @@ struct RangeWithSize { : range(a, b), size(s) {} }; -// Generates a histogram representing potential divisions of key ranges from -// the input. It adds the starting and/or ending keys of certain input files -// to the working set and then finds the approximate size of data in between -// each consecutive pair of slices. Then it divides these ranges into -// consecutive groups such that each group has a similar size. void CompactionJob::GenSubcompactionBoundaries() { auto* c = compact_->compaction; auto* cfd = c->column_family_data(); diff --git a/db/compaction_job.h b/db/compaction_job.h index b3a0f2eb4b5..a37c54de809 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -55,6 +55,11 @@ class Version; class VersionEdit; class VersionSet; +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. class CompactionJob { public: CompactionJob( @@ -80,17 +85,28 @@ class CompactionJob { CompactionJob& operator=(const CompactionJob& job) = delete; // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction void Prepare(); // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results Status Run(); // REQUIRED: mutex held + // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); // update the thread status for starting a compaction. 
@@ -163,6 +179,7 @@ class CompactionJob { EventLogger* event_logger_; + // Is this compaction creating a file in the bottom most level? bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; From 6a54278b4a9b86a1cce359e78db61015e7a1cc07 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 23 May 2019 16:26:08 -0700 Subject: [PATCH 057/572] add class level comment for RepeatableThread Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5344 Differential Revision: D15485431 Pulled By: miasantreble fbshipit-source-id: 9c0f6cf0d826743e743012549976705ceb8cc0c4 --- util/repeatable_thread.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 967cc49945e..2d4729da02c 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -15,6 +15,9 @@ namespace rocksdb { +// Simple wrapper around port::Thread that supports calling a callback every +// X seconds. If you pass in 0, then it will call your callback repeatedly +// without delay. class RepeatableThread { public: RepeatableThread(std::function function, From 74a334a2eb8db6c2ba2f38382be287af908e63c0 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Thu, 23 May 2019 21:54:23 -0700 Subject: [PATCH 058/572] Provide an option so that SST ingestion won't fall back to copy after hard linking fails (#5333) Summary: RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named 'failed_move_fall_back_to_copy' for users to choose which behavior they want. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5333 Differential Revision: D15457597 Pulled By: HaoyuHuang fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214 --- HISTORY.md | 3 +- db/external_sst_file_ingestion_job.cc | 19 ++--- db/external_sst_file_test.cc | 109 ++++++++++++++++++++++---- include/rocksdb/options.h | 2 + 4 files changed, 106 insertions(+), 27 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b65f5a038b1..40d11096df0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -20,7 +21,7 @@ ### Bug Fixes - + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. 
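Before the implementation diffs below, a usage sketch of the new option; `db` is assumed to be an open `rocksdb::DB*` and the function name is illustrative. It requests a move but fails the ingestion outright, instead of silently copying, when the hard link cannot be created.

```cpp
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status IngestWithoutCopy(rocksdb::DB* db, const std::string& file) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  // New in this patch; defaults to true, which preserves the old
  // fall-back-to-copy behavior.
  ifo.failed_move_fall_back_to_copy = false;
  return db->IngestExternalFile({file}, ifo);
}
```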
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678ab..588ac5110a2 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare( // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa2627..3850a2a031e 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,54 @@ namespace rocksdb { +// A test environment that can be configured to fail the Link operation. +class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. 
+ * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. 
+      ASSERT_FALSE(copyfile);
+      ASSERT_EQ(0, bytes_copied);
+    }
   }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
 class TestIngestExternalFileListener : public EventListener {
@@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
                                          std::make_tuple(true, false),
                                          std::make_tuple(true, true)));
 
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 7d22fb67559..cc7119410a0 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1398,6 +1398,8 @@ struct CompactRangeOptions {
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
   bool move_files = false;
+  // If set to true, ingestion falls back to copy when move fails.
+  bool failed_move_fall_back_to_copy = true;
   // If set to false, an ingested file keys could appear in existing snapshots
   // that where created before the file was ingested.
   bool snapshot_consistency = true;

From 5d359fc337803b1b365c7d151799e4d76f75b024 Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Fri, 24 May 2019 10:02:36 -0700
Subject: [PATCH 059/572] Document AlignedBuffer (#5345)

Summary:
Add comments to util/aligned_buffer.h

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5345

Differential Revision: D15496004

Pulled By: sagar0

fbshipit-source-id: 31bc6f35e88dedd74cff55febe02c9e761304f76
---
 util/aligned_buffer.h | 74 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 11 deletions(-)

diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h
index 2201b487770..dbff9c8109e 100644
--- a/util/aligned_buffer.h
+++ b/util/aligned_buffer.h
@@ -13,21 +13,47 @@
 namespace rocksdb {
 
+// This file contains utilities to handle the alignment of pages and buffers.
+
+// Truncate to a multiple of page_size, which is also a page boundary. This
+// helps in figuring out the right alignment.
+// Example:
+//   TruncateToPageBoundary(5000, 4096)  => 4096
+//   TruncateToPageBoundary(10000, 4096) => 8192
 inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
   s -= (s & (page_size - 1));
   assert((s % page_size) == 0);
   return s;
 }
 
+// Round up x to a multiple of y.
+// Example:
+//   Roundup(13, 5)   => 15
+//   Roundup(201, 16) => 208
 inline size_t Roundup(size_t x, size_t y) {
   return ((x + y - 1) / y) * y;
 }
 
+// Round down x to a multiple of y.
+// Example:
+//   Rounddown(13, 5)   => 10
+//   Rounddown(201, 16) => 192
 inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
 
-// This class is to manage an aligned user
-// allocated buffer for direct I/O purposes
-// though can be used for any purpose.
+// AlignedBuffer manages a buffer by taking alignment into consideration, and
+// aligns the buffer start and end positions. It is mainly used for direct
+// I/O, though it can be used for other purposes as well.
+// It also supports expanding the managed buffer, and copying whole or part
+// of the data from the old buffer into the new expanded buffer. Such a copy
+// especially helps to avoid an IO to re-fetch the data from disk.
+//
+// Example:
+//   AlignedBuffer buf;
+//   buf.Alignment(alignment);
+//   buf.AllocateNewBuffer(user_requested_buf_size);
+//   ...
+// buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true, +// copy_offset, copy_len); class AlignedBuffer { size_t alignment_; std::unique_ptr buf_; @@ -96,12 +122,21 @@ class AlignedBuffer { alignment_ = alignment; } - // Allocates a new buffer and sets bufstart_ to the aligned first byte. + // Allocates a new buffer and sets the start position to the first aligned + // byte. + // // requested_capacity: requested new buffer capacity. This capacity will be // rounded up based on alignment. - // copy_data: Copy data from old buffer to new buffer. + // copy_data: Copy data from old buffer to new buffer. If copy_offset and + // copy_len are not passed in and the new requested capacity is bigger + // than the existing buffer's capacity, the data in the exising buffer is + // fully copied over to the new buffer. // copy_offset: Copy data from this offset in old buffer. // copy_len: Number of bytes to copy. + // + // The function does nothing if the new requested_capacity is smaller than + // the current buffer capacity and copy_data is true i.e. the old buffer is + // retained as is. void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, uint64_t copy_offset = 0, size_t copy_len = 0) { assert(alignment_ > 0); @@ -110,7 +145,7 @@ class AlignedBuffer { copy_len = copy_len > 0 ? copy_len : cursize_; if (copy_data && requested_capacity < copy_len) { // If we are downsizing to a capacity that is smaller than the current - // data in the buffer. Ignore the request. + // data in the buffer -- Ignore the request. return; } @@ -132,8 +167,15 @@ class AlignedBuffer { capacity_ = new_capacity; buf_.reset(new_buf); } - // Used for write - // Returns the number of bytes appended + + // Append to the buffer. + // + // src : source to copy the data from. + // append_size : number of bytes to copy from src. + // Returns the number of bytes appended. + // + // If append_size is more than the remaining buffer size only the + // remaining-size worth of bytes are copied. size_t Append(const char* src, size_t append_size) { size_t buffer_remaining = capacity_ - cursize_; size_t to_copy = std::min(append_size, buffer_remaining); @@ -145,6 +187,12 @@ class AlignedBuffer { return to_copy; } + // Read from the buffer. + // + // dest : destination buffer to copy the data to. + // offset : the buffer offset to start reading from. + // read_size : the number of bytes to copy from the buffer to dest. + // Returns the number of bytes read/copied to dest. size_t Read(char* dest, size_t offset, size_t read_size) const { assert(offset < cursize_); @@ -158,7 +206,7 @@ class AlignedBuffer { return to_read; } - /// Pad to alignment + // Pad to the end of alignment with "padding" void PadToAlignmentWith(int padding) { size_t total_size = Roundup(cursize_, alignment_); size_t pad_size = total_size - cursize_; @@ -176,7 +224,7 @@ class AlignedBuffer { cursize_ += pad_size; } - // After a partial flush move the tail to the beginning of the buffer + // After a partial flush move the tail to the beginning of the buffer. void RefitTail(size_t tail_offset, size_t tail_size) { if (tail_size > 0) { memmove(bufstart_, bufstart_ + tail_offset, tail_size); @@ -184,7 +232,11 @@ class AlignedBuffer { cursize_ = tail_size; } - // Returns place to start writing + // Returns a place to start appending. + // WARNING: Note that it is possible to write past the end of the buffer if + // the buffer is modified without using the write APIs or encapsulation + // offered by AlignedBuffer. 
It is up to the user to guard against such + // errors. char* Destination() { return bufstart_ + cursize_; } From 94c78b11e411d15f23bbc0c3c3f95c7e070ea528 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 10:27:28 -0700 Subject: [PATCH 060/572] improve comments for statistics.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5351 Differential Revision: D15496346 Pulled By: miasantreble fbshipit-source-id: eeb619e6bd8616003ba35b0cd4bb8050e6a8cb4d --- include/rocksdb/statistics.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 3b2b2e048c7..653b460cbdd 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -447,6 +447,10 @@ struct HistogramData { double min = 0.0; }; +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, @@ -464,7 +468,15 @@ enum StatsLevel : uint8_t { kAll, }; -// Analyze the performance of a db +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = rocksdb::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); class Statistics { public: virtual ~Statistics() {} From 88ff80780b3ccdbf802625c8302b9e4405a09b66 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 10:36:26 -0700 Subject: [PATCH 061/572] improve comment for WalManager (#5350) Summary: att Pull Request resolved: https://github.com/facebook/rocksdb/pull/5350 Differential Revision: D15496467 Pulled By: miasantreble fbshipit-source-id: c29c0b143bf4df2040695a82be0feb9814ddb641 --- db/wal_manager.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/wal_manager.h b/db/wal_manager.h index 6caf1640c06..9d5afb25d5e 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -28,6 +28,10 @@ namespace rocksdb { #ifndef ROCKSDB_LITE + +// WAL manager provides the abstraction for reading the WAL files as a single +// unit. Internally, it opens and reads the files using Reader or Writer +// abstraction. class WalManager { public: WalManager(const ImmutableDBOptions& db_options, @@ -40,6 +44,8 @@ class WalManager { Status GetSortedWalFiles(VectorLogPtr& files); + // Allow user to tail transaction log to find all recent changes to the + // database that are newer than `seq_number`. 
Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options, From 98094f6caca6a5c0d2cff4c36f3bfdc7c1fcb7b6 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 24 May 2019 11:16:47 -0700 Subject: [PATCH 062/572] Add some comments for BlockContents Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5354 Differential Revision: D15496645 Pulled By: ltamasi fbshipit-source-id: 1282b1ce11fbc412d3d87b2688fd0586e7bb6b85 --- table/format.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/table/format.h b/table/format.h index f5858850559..84242303ec7 100644 --- a/table/format.h +++ b/table/format.h @@ -194,6 +194,10 @@ inline CompressionType get_block_compression_type(const char* block_data, return static_cast(block_data[block_size]); } +// Represents the contents of a block read from an SST file. Depending on how +// it's created, it may or may not own the actual block bytes. As an example, +// BlockContents objects representing data read from mmapped files only point +// into the mmapped region. struct BlockContents { Slice data; // Actual contents of data CacheAllocationPtr allocation; @@ -206,16 +210,20 @@ struct BlockContents { BlockContents() {} + // Does not take ownership of the underlying data bytes. BlockContents(const Slice& _data) : data(_data) {} + // Takes ownership of the underlying data bytes. BlockContents(CacheAllocationPtr&& _data, size_t _size) : data(_data.get(), _size), allocation(std::move(_data)) {} + // Takes ownership of the underlying data bytes. BlockContents(std::unique_ptr&& _data, size_t _size) : data(_data.get(), _size) { allocation.reset(_data.release()); } + // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } // It's the caller's responsibility to make sure that this is From 767d1f3ff17b002659f48a520c84fbb09f6ac3fc Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 11:37:06 -0700 Subject: [PATCH 063/572] Improve comments for StatsHistoryIterator and InMemoryStatsHistoryIterator Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5346 Differential Revision: D15497679 Pulled By: miasantreble fbshipit-source-id: c10caf10293c3d9663bfb398a0d331326d1e9e67 --- db/db_impl.h | 1 - db/in_memory_stats_history.cc | 4 ++++ db/in_memory_stats_history.h | 19 +++++++++++++++++++ include/rocksdb/db.h | 5 +++-- include/rocksdb/stats_history.h | 20 +++++++++++++++++++- 5 files changed, 45 insertions(+), 4 deletions(-) diff --git a/db/db_impl.h b/db/db_impl.h index 08cb1949118..f574a8f4479 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -761,7 +761,6 @@ class DBImpl : public DB { static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - // Given a time window, return an iterator for accessing stats history Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; diff --git a/db/in_memory_stats_history.cc b/db/in_memory_stats_history.cc index 39355cfbe0a..e9e0cc74950 100644 --- a/db/in_memory_stats_history.cc +++ b/db/in_memory_stats_history.cc @@ -17,6 +17,10 @@ bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } Status InMemoryStatsHistoryIterator::status() const { return status_; } +// Because of garbage collection, the next stats snapshot may or may not be +// right after the current one. 
When reading from DBImpl::stats_history_, this +// call will be protected by DB Mutex so it will not return partial or +// corrupted results. void InMemoryStatsHistoryIterator::Next() { // increment start_time by 1 to avoid infinite loop AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); diff --git a/db/in_memory_stats_history.h b/db/in_memory_stats_history.h index 4b52e23fffa..eeb679cc0a2 100644 --- a/db/in_memory_stats_history.h +++ b/db/in_memory_stats_history.h @@ -12,8 +12,20 @@ namespace rocksdb { +// InMemoryStatsHistoryIterator can be used to access stats history that was +// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps +// a copy of the stats snapshot (in stats_map_) that is currently being pointed +// to, which allows the iterator to access the stats snapshot even when +// the background garbage collecting thread purges it from the source of truth +// (`DBImpl::stats_history_`). In that case, the iterator will continue to be +// valid until a call to `Next()` returns no result and invalidates it. In +// some extreme cases, the iterator may also return fragmented segments of +// stats snapshots due to long gaps between `Next()` calls and interleaved +// garbage collection. class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: + // Setup InMemoryStatsHistoryIterator to return stats snapshots between + // microsecond timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), @@ -26,9 +38,16 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { bool Valid() const override; Status status() const override; + // Move to the next stats snapshot currently available + // This function may invalidate the iterator + // REQUIRES: Valid() void Next() override; + + // REQUIRES: Valid() uint64_t GetStatsTime() const override; + // This function is idempotent + // REQUIRES: Valid() const std::map& GetStatsMap() const override; private: diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 7b49b92c239..b0538433b4a 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1322,8 +1322,9 @@ class DB { // Needed for StackableDB virtual DB* GetRootDB() { return this; } - // Given a time window, return an iterator for accessing stats history - // User is responsible for deleting StatsHistoryIterator after use + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in microsecond, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 40ea51d1ff0..1a841908170 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -11,7 +11,6 @@ #include #include -// #include "db/db_impl.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -19,6 +18,25 @@ namespace rocksdb { class DBImpl; +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. 
+// Example: +// std::unique_ptr stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } class StatsHistoryIterator { public: StatsHistoryIterator() {} From 596cc1547a01b8299293f9fb43f219722eeb6dad Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 24 May 2019 12:03:16 -0700 Subject: [PATCH 064/572] Update comments in column_family.h (#5347) Summary: Document relationships of data structures declared in column_family.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5347 Differential Revision: D15496941 Pulled By: siying fbshipit-source-id: 47b37835abba26aa31a94fabea6b2775483e0ccb --- db/column_family.h | 108 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/db/column_family.h b/db/column_family.h index 7a1ae85bfd3..655cb159261 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -45,7 +45,113 @@ class InstrumentedMutexLock; struct SuperVersionContext; extern const double kIncSlowdownRatio; - +// This file contains a list of data structures for managing column family +// level metadata. +// +// The basic relationships among classes declared here are illustrated as +// following: +// +// +----------------------+ +----------------------+ +--------+ +// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | +// | +----------------------+ | +----------------------+ +----+---+ +// | +--------------------------+ | +// | | +-----------------------------+ +// | | | +// | | +-----------------------------v-------------------------------+ +// | | | | +// | | | ColumnFamilySet | +// | | | | +// | | +-------------+--------------------------+----------------+---+ +// | | | | | +// | +-------------------------------------+ | | +// | | | | v +// | +-------------v-------------+ +-----v----v---------+ +// | | | | | +// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... +// | | | | | +// +---> | | | +// | +---------+ | | +// | | MemTable| | | +// | | List | | | +// +--------+---+--+-+----+----+ +--------------------++ +// | | | | +// | | | | +// | | | +-----------------------+ +// | | +-----------+ | +// v +--------+ | | +// +--------+--------+ | | | +// | | | | +----------v----------+ +// +---> |SuperVersion 1.a +-----------------> | +// | +------+ | | MemTableListVersion | +// +---+-------------+ | | | | | +// | | | | +----+------------+---+ +// | current | | | | | +// | +-------------+ | |mem | | +// | | | | | | +// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ +// | | | | | | | | +// | Version 1.a | | memtable | | memtable | | memtable | +// | | | 1.a | | 1.b | | 1.c | +// +-------------+ | | | | | | +// +----------+ +----------+ +----------+ +// +// DBImpl keeps a ColumnFamilySet, which references to all column families by +// pointing to respective ColumnFamilyData object of each column family. +// This is how DBImpl can list and operate on all the column families. +// ColumnFamilyHandle also points to ColumnFamilyData directly, so that +// when a user executes a query, it can directly find memtables and Version +// as well as SuperVersion to the column family, without going through +// ColumnFamilySet. 
+// +// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables +// and SST files) indirectly, while ongoing operations may hold references +// to a current or an out-of-date SuperVersion, which in turn points to a +// point-in-time view of the LSM-tree. This guarantees the memtables and SST +// files being operated on will not go away, until the SuperVersion is +// unreferenced to 0 and destoryed. +// +// The following graph illustrates a possible referencing relationships: +// +// Column +--------------+ current +-----------+ +// Family +---->+ +------------------->+ | +// Data | SuperVersion +----------+ | Version A | +// | 3 | imm | | | +// Iter2 +----->+ | +-------v------+ +-----------+ +// +-----+--------+ | MemtableList +----------------> Empty +// | | Version r | +-----------+ +// | +--------------+ | | +// +------------------+ current| Version B | +// +--------------+ | +----->+ | +// | | | | +-----+-----+ +// Compaction +>+ SuperVersion +-------------+ ^ +// Job | 2 +------+ | |current +// | +----+ | | mem | +------------+ +// +--------------+ | | +---------------------> | +// | +------------------------> MemTable a | +// | mem | | | +// +--------------+ | | +------------+ +// | +--------------------------+ +// Iter1 +-----> SuperVersion | | +------------+ +// | 1 +------------------------------>+ | +// | +-+ | mem | MemTable b | +// +--------------+ | | | | +// | | +--------------+ +-----^------+ +// | |imm | MemtableList | | +// | +--->+ Version s +------------+ +// | +--------------+ +// | +--------------+ +// | | MemtableList | +// +------>+ Version t +--------> Empty +// imm +--------------+ +// +// In this example, even if the current LSM-tree consists of Version A and +// memtable a, which is also referenced by SuperVersion, two older SuperVersion +// SuperVersion2 and Superversion1 still exist, and are referenced by a +// compaction job and an old iterator Iter1, respectively. SuperVersion2 +// contains Version B, memtable a and memtable b; SuperVersion1 contains +// Version B and memtable b (mutable). As a result, Version B and memtable b +// are prevented from being destroyed or deleted. + // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client // is done using the column family From f69e63dc5fa99277bc1e1ef6140383207be3c8ac Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 24 May 2019 12:20:14 -0700 Subject: [PATCH 065/572] Improve comments in compaction.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5356 Differential Revision: D15499033 Pulled By: siying fbshipit-source-id: 069ae48669484beaf668dd90389b8743b3309dc3 --- db/compaction.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/compaction.h b/db/compaction.h index 2cf737b676a..e9ded632503 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -14,6 +14,8 @@ #include "util/autovector.h" namespace rocksdb { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. // Utility for comparing sstable boundary keys. Returns -1 if either a or b is // null which provides the property that a==null indicates a key that is less @@ -63,7 +65,7 @@ class ColumnFamilyData; class VersionStorageInfo; class CompactionFilter; -// A Compaction encapsulates information about a compaction. +// A Compaction encapsulates metadata about a compaction. 
class Compaction {
 public:
  Compaction(VersionStorageInfo* input_version,
@@ -376,7 +378,7 @@ class Compaction {
   CompactionReason compaction_reason_;
 };
 
-// Utility function
+// Return the sum of the sizes of all files in `files`.
 extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
 
 }  // namespace rocksdb

From f66026c8c7a93473854966519d56c5d4fa115b24 Mon Sep 17 00:00:00 2001
From: Vijay Nadimpalli
Date: Fri, 24 May 2019 12:26:58 -0700
Subject: [PATCH 066/572] Comments for BlockBasedTable

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5352

Differential Revision: D15498477

Pulled By: vjnadimpalli

fbshipit-source-id: 08a981521848433362a56ac521c7fb83c7dd7b2a
---
 table/block_based_table_reader.h | 19 +++++++++++++++----
 table/table_reader.h             |  8 +++++---
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 8274f0cf965..270409b3ab6 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -59,9 +59,17 @@ class GetContext;
 
 typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
 
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.  A Table may be safely accessed from
-// multiple threads without external synchronization.
+// Reader class for the BlockBasedTable format. For the format itself, refer
+// to https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress
+// and/or encode the data efficiently within a block, which often results in
+// a much smaller data size compared with the raw data size. As for record
+// retrieval, we first locate the block where the target record may reside,
+// then read the block into memory, and finally search for that record within
+// the block. To avoid frequent reads of the same block, we introduced the
+// block cache to keep loaded blocks in memory.
 class BlockBasedTable : public TableReader {
  public:
   static const std::string kFilterBlockPrefix;
@@ -425,7 +433,7 @@ class BlockBasedTable : public TableReader {
   friend class PartitionedFilterBlockTest;
 };
 
-// Maitaning state of a two-level iteration on a partitioned index structure
+// Maintaining state of a two-level iteration on a partitioned index structure.
 class BlockBasedTable::PartitionedIndexIteratorState
     : public TwoLevelIteratorState {
  public:
@@ -444,6 +452,8 @@ class BlockBasedTable::PartitionedIndexIteratorState
   bool index_key_is_full_;
 };
 
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
 struct BlockBasedTable::Rep {
   Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
       const BlockBasedTableOptions& _table_opt,
@@ -553,6 +563,7 @@ struct BlockBasedTable::Rep {
   }
 };
 
+// Iterates over the contents of BlockBasedTable.
 template <class TBlockIter, typename TValue = Slice>
 class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  public:
diff --git a/table/table_reader.h b/table/table_reader.h
index bd6071d9c67..037dbc33818 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -26,9 +26,11 @@ struct TableProperties;
 class GetContext;
 class MultiGetContext;
 
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.  A Table may be safely accessed from
-// multiple threads without external synchronization.
+// A Table (also referred to as SST) is a sorted map from strings to strings.
+// Tables are immutable and persistent. A Table may be safely accessed from
+// multiple threads without external synchronization. Table readers are used
+// for reading various types of table formats supported by rocksdb including
+// BlockBasedTable, PlainTable and CuckooTable format.
 class TableReader {
  public:
   virtual ~TableReader() {}

From 6267ed251ae5162b7b5c41521061e5541af301f5 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Fri, 24 May 2019 13:05:58 -0700
Subject: [PATCH 067/572] Improve comment in db_impl.h (#5338)

Summary:
Add some comments in db_impl.h. Also reordered the functions a little so
that I could add a comment to flag the area of functions implementing the
DB interface.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5338

Differential Revision: D15498284

Pulled By: siying

fbshipit-source-id: 3d7c59c8303577fe44d13c74ae84c7ce05164f77
---
 db/db_impl.h | 355 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 193 insertions(+), 162 deletions(-)

diff --git a/db/db_impl.h b/db/db_impl.h
index f574a8f4479..f2544e85941 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -75,16 +75,30 @@ struct JobContext;
 struct ExternalSstFileInfo;
 struct MemTableInfo;
 
+// DB is the public interface of RocksDB, and DBImpl is the class that
+// actually implements it. It is the entrance to the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc., wrap a
+// DBImpl internally.
+// Other than the functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it is a very large class, the definitions of its functions are
+// divided among several db_impl_*.cc files in addition to db_impl.cc.
class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, const bool seq_per_batch = false, const bool batch_per_txn = true); virtual ~DBImpl(); + // ---- Implementations of the DB interface ---- + using DB::Resume; virtual Status Resume() override; - // Implementations of the DB interface using DB::Put; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -110,13 +124,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - // Function that Get and KeyMayExist call with no_io true or false - // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); - using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -174,12 +181,6 @@ class DBImpl : public DB { const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; - ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, - SequenceNumber snapshot, - ReadCallback* read_callback, - bool allow_blob = false, - bool allow_refresh = true); virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; @@ -259,23 +260,19 @@ class DBImpl : public DB { virtual Status UnlockWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; - virtual SequenceNumber GetLastPublishedSequence() const { - if (last_seq_same_as_publish_seq_) { - return versions_->LastSequence(); - } else { - return versions_->LastPublishedSequence(); - } - } - // REQUIRES: joined the main write queue if two_write_queues is disabled, and - // the second write queue otherwise. - virtual void SetLastPublishedSequence(SequenceNumber seq); - // Returns LastSequence in last_seq_same_as_publish_seq_ - // mode and LastAllocatedSequence otherwise. This is useful when visiblility - // depends also on data written to the WAL but not to the memtable. 
- SequenceNumber TEST_GetLastVisibleSequence() const; virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + virtual Status GetDbIdentity(std::string& identity) const override; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + virtual Status Close() override; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -313,12 +310,76 @@ class DBImpl : public DB { Status PromoteL0(ColumnFamilyHandle* column_family, int target_level) override; + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + virtual Status VerifyChecksum() override; + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + +#endif // ROCKSDB_LITE + + // ---- End of implementations of the DB interface ---- + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + bool* value_found = nullptr, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visiblility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; + +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback); + // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into the current // memtables. 
It can then be assumed that any write with a larger(or equal) @@ -360,25 +421,6 @@ class DBImpl : public DB { bool* found_record_for_key, bool* is_blob_index = nullptr); - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override; - - using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( - const std::vector& args) override; - - virtual Status VerifyChecksum() override; - - using DB::StartTrace; - virtual Status StartTrace( - const TraceOptions& options, - std::unique_ptr&& trace_writer) override; - - using DB::EndTrace; - virtual Status EndTrace() override; Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE @@ -393,8 +435,6 @@ class DBImpl : public DB { // match to our in-memory records virtual Status CheckConsistency(); - virtual Status GetDbIdentity(std::string& identity) const override; - // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering @@ -416,102 +456,6 @@ class DBImpl : public DB { return &logs_with_prep_tracker_; } -#ifndef NDEBUG - // Extra methods (for testing) that are not in the public DB interface - // Implemented in db_impl_debug.cc - - // Compact any files in the named level that overlap [*begin, *end] - Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, - ColumnFamilyHandle* column_family = nullptr, - bool disallow_trivial_move = false); - - void TEST_SwitchWAL(); - - bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } - - bool TEST_IsLogGettingFlushed() { - return alive_log_files_.begin()->getting_flushed; - } - - Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); - - // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, - ColumnFamilyHandle* cfh = nullptr); - - // Wait for memtable compaction - Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); - - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status TEST_WaitForCompact(bool waitUnscheduled = false); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( - ColumnFamilyHandle* column_family = nullptr); - - // Return the current manifest file no. - uint64_t TEST_Current_Manifest_FileNo(); - - // Returns the number that'll be assigned to the next file that's created. - uint64_t TEST_Current_Next_FileNo(); - - // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize(); - - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); - - void TEST_LockMutex(); - - void TEST_UnlockMutex(); - - // REQUIRES: mutex locked - void* TEST_BeginWrite(); - - // REQUIRES: mutex locked - // pass the pointer that you got from TEST_BeginWrite() - void TEST_EndWrite(void* w); - - uint64_t TEST_MaxTotalInMemoryState() const { - return max_total_in_memory_state_; - } - - size_t TEST_LogsToFreeSize(); - - uint64_t TEST_LogfileNumber(); - - uint64_t TEST_total_log_size() const { return total_log_size_; } - - // Returns column family name to ImmutableCFOptions map. - Status TEST_GetAllImmutableCFOptions( - std::unordered_map* iopts_map); - - // Return the lastest MutableCFOptions of a column family - Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, - MutableCFOptions* mutable_cf_options); - - Cache* TEST_table_cache() { return table_cache_.get(); } - - WriteController& TEST_write_controler() { return write_controller_; } - - uint64_t TEST_FindMinLogContainingOutstandingPrep(); - uint64_t TEST_FindMinPrepLogReferencedByMemTable(); - size_t TEST_PreparedSectionCompletedSize(); - size_t TEST_LogsWithPrepSize(); - - int TEST_BGCompactionsAllowed() const; - int TEST_BGFlushesAllowed() const; - size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; - -#endif // NDEBUG - struct BGJobLimits { int max_flushes; int max_compactions; @@ -555,12 +499,15 @@ class DBImpl : public DB { void PurgeObsoleteFiles(JobContext& background_contet, bool schedule_only = false); + // Schedule a background job to actually delete obsolete files. void SchedulePurge(); - ColumnFamilyHandle* DefaultColumnFamily() const override; - const SnapshotList& snapshots() const { return snapshots_; } + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. void LoadSnapshots(std::vector* snap_vector, SequenceNumber* oldest_write_conflict_snapshot, const SequenceNumber& max_seq) const { @@ -572,6 +519,10 @@ class DBImpl : public DB { return immutable_db_options_; } + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. void CancelAllBackgroundWork(bool wait); // Find Super version and reference it. Based on options, it might return @@ -748,6 +699,8 @@ class DBImpl : public DB { InstrumentedMutex* mutex() const { return &mutex_; } + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. Status NewDB(); // This is to be used only by internal rocksdb classes. 
@@ -756,21 +709,109 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - virtual Status Close() override; static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - Status GetStatsHistory( - uint64_t start_time, uint64_t end_time, - std::unique_ptr* stats_iterator) override; - // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) bool FindStatsByTime(uint64_t start_time, uint64_t end_time, uint64_t* new_time, std::map* stats_map); +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + void TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. 
+ Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstiamteStatsHistorySize() const; + +#endif // NDEBUG + protected: Env* const env_; const std::string dbname_; @@ -1700,16 +1741,6 @@ class DBImpl : public DB { ColumnFamilyData* cfd, SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options); -#ifndef ROCKSDB_LITE - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override; - virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override; - -#endif // ROCKSDB_LITE bool GetIntPropertyInternal(ColumnFamilyData* cfd, const DBPropertyInfo& property_info, From eb7647ee6ce96fdeb3f49a341463efab50cc7658 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 24 May 2019 13:24:52 -0700 Subject: [PATCH 068/572] Add comments t get_context.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5353 Differential Revision: D15497912 Pulled By: anand1976 fbshipit-source-id: 72cff2465ca342aa810f925be5a7016b938aa416 --- table/get_context.h | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/table/get_context.h b/table/get_context.h index 7ed316f0e1a..856e01a9502 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -17,6 +17,9 @@ namespace rocksdb { class MergeContext; class PinnedIteratorsManager; +// Data structure for accumulating statistics during a point lookup. At the +// end of the point lookup, the corresponding ticker stats are updated. This +// avoids the overhead of frequent ticker stats updates struct GetContextStats { uint64_t num_cache_hit = 0; uint64_t num_cache_index_hit = 0; @@ -41,8 +44,17 @@ struct GetContextStats { uint64_t num_cache_compression_dict_bytes_insert = 0; }; +// A class to hold context about a point lookup, such as pointer to value +// slice, key, merge context etc, as well as the current state of the +// lookup. Any user using GetContext to track the lookup result must call +// SaveValue() whenever the internal key is found. This can happen +// repeatedly in case of merge operands. In case the key may exist with +// high probability, but IO is required to confirm and the user doesn't allow +// it, MarkKeyMayExist() must be called instead of SaveValue(). class GetContext { public: + // Current state of the point lookup. 
All except kNotFound and kMerge are + // terminal states enum GetState { kNotFound, kFound, @@ -53,6 +65,19 @@ class GetContext { }; GetContextStats get_context_stats_; + // Constructor + // @param value_found If non-nullptr, set to false if key may be present + // but we can't be certain because we cannot do IO + // @param max_covering_tombstone_seq Pointer to highest sequence number of + // range deletion covering the key. When an internal key + // is found with smaller sequence number, the lookup + // terminates + // @param seq If non-nullptr, the sequence number of the found key will be + // saved here + // @param callback Pointer to ReadCallback to perform additional checks + // for visibility of a key + // @param is_blob_index If non-nullptr, will be used to indicate if a found + // key is of type blob index GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, @@ -64,13 +89,15 @@ class GetContext { GetContext() = default; + // This can be called to indicate that a key may be present, but cannot be + // confirmed due to IO not allowed void MarkKeyMayExist(); // Records this key, value, and any meta-data (such as sequence number and // state) into this GetContext. // // If the parsed_key matches the user key that we are looking for, sets - // mathced to true. + // matched to true. // // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. @@ -133,6 +160,9 @@ class GetContext { bool* is_blob_index_; }; +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner = nullptr); From 029b98984e2f6babc2526362ddfffeea0798d625 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 24 May 2019 14:22:42 -0700 Subject: [PATCH 069/572] Add some comments in table_cache.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5343 Differential Revision: D15485831 Pulled By: anand1976 fbshipit-source-id: 8735ccfba90d7ecb3559e63f792e34527f04ed29 --- db/table_cache.h | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/db/table_cache.h b/db/table_cache.h index 1e96dfa1bd5..64d7b898b22 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -32,6 +32,19 @@ struct FileDescriptor; class GetContext; class HistogramImpl; +// Manages caching for TableReader objects for a column family. The actual +// cache is allocated separately and passed to the constructor. TableCache +// wraps around the underlying SST file readers by providing Get(), +// MultiGet() and NewIterator() methods that hide the instantiation, +// caching and access to the TableReader. The main purpose of this is +// performance - by caching the TableReader, it avoids unnecessary file opens +// and object allocation and instantiation. One exception is compaction, where +// a new TableReader may be instantiated - see NewIterator() comments +// +// Another service provided by TableCache is managing the row cache - if the +// DB is configured with a row cache, and the lookup key is present in the row +// cache, lookup is very fast. 
The row cache is obtained from +// ioptions.row_cache class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, @@ -39,14 +52,16 @@ class TableCache { ~TableCache(); // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-nullptr, also sets "*tableptr" to point to the Table object + // file length must be exactly "file_size" bytes). If "table_reader_ptr" + // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object // underlying the returned iterator, or nullptr if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the + // the returned iterator. The returned "*table_reader_ptr" object is owned + // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator + // @param for_compaction If true, a new TableReader may be allocated (but + // not cached), depending on the CF options // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" InternalIterator* NewIterator( @@ -61,11 +76,13 @@ class TableCache { const InternalKey* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, - // call (*handle_result)(arg, found_key, found_value) repeatedly until - // it returns false. - // @param get_context State for get operation. If its range_del_agg() returns - // non-nullptr, adds range deletions to the aggregator. If an error occurs, - // returns non-ok status. + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param get_context Context for get operation. The result of the lookup + // can be retrieved by calling get_context->State() + // @param file_read_hist If non-nullptr, the file reader statistics are + // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, @@ -76,6 +93,15 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1); + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param mget_range Pointer to the structure describing a batch of keys to + // be looked up in this table file. 
The result is stored + // in the embedded GetContext + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta,
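As an aside to the row-cache comment above: the cache that TableCache consults is supplied by the user through the public API. A minimal sketch of enabling it (the path and capacity are arbitrary placeholders, not part of the patch):

#include "rocksdb/cache.h"
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Row cache consulted by TableCache on point lookups; a hit returns
  // without touching the table reader at all.
  options.row_cache = rocksdb::NewLRUCache(64 << 20);  // 64 MB, arbitrary

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/row_cache_demo", &db);
  if (s.ok()) {
    delete db;
  }
  return 0;
}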
From a466120cd50a87caf786311beca5684b8dc40eae Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 15:26:02 -0700 Subject: [PATCH 070/572] improve comments in db_impl_secondary Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5360 Differential Revision: D15502973 Pulled By: miasantreble fbshipit-source-id: 15b7f9d7928e771a6fac0643861173be8ba6b37a --- db/db_impl_secondary.cc | 2 -- db/db_impl_secondary.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 5dfa2d0c942..586158ef7ce 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -120,8 +120,6 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) { return s; } -// try to find log reader using log_number from log_readers_ map, initialize -// if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( uint64_t log_number, log::FragmentBufferedReader** log_reader) { auto iter = log_readers_.find(log_number); diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 912708b1ec0..a57835432dc 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -13,6 +13,7 @@ namespace rocksdb { +// A wrapper class to hold a log reader, along with its reporter and status. class LogReaderContainer { public: LogReaderContainer() @@ -62,11 +63,19 @@ class LogReaderContainer { }; }; +// The secondary instance shares access to the same storage as the primary. +// The secondary is able to read and replay changes described in both the +// MANIFEST and the WAL files without coordination with the primary. +// The secondary instance can be opened using `DB::OpenAsSecondary`. After +// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make +// best-effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); ~DBImplSecondary() override; + // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ + // and log_readers_ to facilitate future operations. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, bool error_if_log_file_exist, bool error_if_data_exists_in_logs) override; @@ -182,10 +191,15 @@ class DBImplSecondary : public DBImpl { // method can take long time due to all the I/O and CPU costs. Status TryCatchUpWithPrimary() override; + + // Try to find the log reader for log_number in the log_readers_ map; + // initialize one if it doesn't exist Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); protected: + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs class ColumnFamilyCollector : public WriteBatch::Handler { std::unordered_set<uint32_t> column_family_ids_; @@ -262,6 +276,8 @@ class DBImplSecondary : public DBImpl { std::unordered_set<ColumnFamilyData*>* cfds_changed, JobContext* job_context); Status FindNewLogNumbers(std::vector<uint64_t>* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence, std::unordered_set<ColumnFamilyData*>* cfds_changed,
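For context on the secondary mode documented above, a sketch of the corresponding public API flow; the paths are placeholders and error handling is reduced to asserts:

#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  // "/tmp/primary" is an existing DB dir; "/tmp/secondary" holds the
  // secondary's own info log and metadata. Both paths are placeholders.
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/tmp/primary", "/tmp/secondary", &db);
  assert(s.ok());

  // Best-effort replay of MANIFEST/WAL changes made by the primary since
  // the last catch-up.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  delete db;
  return 0;
}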
From b09c018b4d42049de5a9275f2af3c0776b622655 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 24 May 2019 16:55:53 -0700 Subject: [PATCH 071/572] Add comments to trace_replay.h (#5359) Summary: Add file, class, and function level comments in trace_replay.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5359 Differential Revision: D15505318 Pulled By: sagar0 fbshipit-source-id: 181e3d4ea805fd9a33f91b89e123bbd0c1ead2ce --- util/trace_replay.h | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/util/trace_replay.h b/util/trace_replay.h index 29c00c287b2..d4030c61518 100644 --- a/util/trace_replay.h +++ b/util/trace_replay.h @@ -15,6 +15,9 @@ namespace rocksdb { +// This file contains Tracer and Replayer classes that enable capturing and +// replaying RocksDB traces. + class ColumnFamilyHandle; class ColumnFamilyData; class DB; @@ -29,6 +32,7 @@ const unsigned int kTracePayloadLengthSize = 4; const unsigned int kTraceMetadataSize = kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize; +// Supported Trace types. enum TraceType : char { kTraceBegin = 1, kTraceEnd = 2, @@ -36,13 +40,16 @@ enum TraceType : char { kTraceGet = 4, kTraceIteratorSeek = 5, kTraceIteratorSeekForPrev = 6, + // All trace types should be added before kTraceMax kTraceMax, }; // TODO: This should also be made part of public interface to help users build // custom TracerReaders and TraceWriters. + // + // The data structure that defines a single trace. struct Trace { - uint64_t ts; + uint64_t ts; // timestamp TraceType type; std::string payload; @@ -53,25 +60,47 @@ struct Trace { } }; -// Trace RocksDB operations using a TraceWriter. +// Tracer captures all RocksDB operations using a user-provided TraceWriter. +// Every RocksDB operation is written as a single trace. Each trace will have a +// timestamp and type, followed by the trace payload. class Tracer { public: Tracer(Env* env, const TraceOptions& trace_options, std::unique_ptr<TraceWriter>&& trace_writer); ~Tracer(); + // Trace all write operations -- Put, Merge, Delete, SingleDelete, Write Status Write(WriteBatch* write_batch); + + // Trace Get operations. Status Get(ColumnFamilyHandle* cfname, const Slice& key); + + // Trace Iterators. Status IteratorSeek(const uint32_t& cf_id, const Slice& key); Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + + // Returns true if the trace is over the configured max trace file limit. + // False otherwise. bool IsTraceFileOverMax(); + // Writes a trace footer at the end of the tracing Status Close(); private: + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number, trace version, RocksDB version, and + // trace format. Status WriteHeader(); + + // Write a trace footer, typically on ending a trace, with some metadata. Status WriteFooter(); + + // Write a single trace using the provided TraceWriter to the underlying + // system, say, a filesystem or a streaming service. Status WriteTrace(const Trace& trace); + + // Helps in filtering and sampling of traces. + // Returns true if a trace should be skipped, false otherwise. bool ShouldSkipTrace(const TraceType& type); Env* env_; @@ -80,14 +109,24 @@ class Tracer { uint64_t trace_request_count_; }; -// Replay RocksDB operations from a trace. +// Replayer helps to replay the captured RocksDB operations, using a +// user-provided TraceReader. +// The Replayer is instantiated via db_bench today, when using the "replay" +// benchmark. class Replayer { public: Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, std::unique_ptr<TraceReader>&& reader); ~Replayer(); + // Replay all the traces from the provided trace stream, taking the delay + // between the traces into consideration. Status Replay(); + + // Enables fast forwarding a replay by reducing the delay between the ingested + // traces. + // fast_forward : Rate of replay speedup. + // If 1, replay the operations at the same rate as in the trace stream. + // If > 1, speed up the replay by this amount. Status SetFastForward(uint32_t fast_forward); private:
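Tracer and Replayer are internal classes; the public entry points are DB::StartTrace/DB::EndTrace together with a TraceWriter. A hedged sketch of capturing a trace (the trace file path is a placeholder; error checks elided):

#include <memory>
#include <utility>
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

void TraceSomeWork(rocksdb::DB* db, rocksdb::Env* env) {
  // A file-backed TraceWriter; other sinks can implement the interface.
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(), "/tmp/db.trace",
                              &writer);

  rocksdb::TraceOptions trace_opts;  // defaults: no sampling, large size cap
  db->StartTrace(trace_opts, std::move(writer));

  db->Put(rocksdb::WriteOptions(), "key", "value");  // captured as a trace
  db->EndTrace();  // writes the trace footer
}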
From bd9f1d2d0ff7ea7beb289cb1ca230f1593ceedae Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 24 May 2019 18:35:11 -0700 Subject: [PATCH 072/572] Fix RocksDB auto-recovery from SpaceLimit err (#5334) Summary: If RocksDB is configured with a positive max_allowed_space (via sst file manager), then the sst file manager should use this value instead of total free disk space to determine whether to clear the background error of space limit reached. In DBSSTTest.DBWithMaxSpaceAllowed, we configure a low space limit that is very likely lower than the free disk space of the test machine. Therefore, once the test db encounters a Status::SpaceLimit, error handler will call into sst file manager to start error recovery which may clear the bg error since disk free space is larger than reserved_disk_buffer_. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5334 Differential Revision: D15501622 Pulled By: riversand963 fbshipit-source-id: 58035efc450b062d6b28c78c322005ec3705fb47 --- util/sst_file_manager_impl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 6a770b106e8..047b0c093d6 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -266,6 +266,9 @@ void SstFileManagerImpl::ClearError() { uint64_t free_space; Status s = env_->GetFreeSpace(path_, &free_space); + free_space = max_allowed_space_ > 0 + ? std::min(max_allowed_space_, free_space) + : free_space; if (s.ok()) { // In case of multi-DB instances, some of them may have experienced a // soft error and some a hard error. In the SstFileManagerImpl, a hard
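The max_allowed_space in question is configured through the public SstFileManager interface; an illustrative sketch, with an arbitrary 64 GB cap and placeholder path:

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/sst_file_manager.h"

void OpenWithSpaceLimit(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;

  // SstFileManager tracks SST space; exceeding the cap surfaces
  // Status::SpaceLimit() as a background error, which the code above
  // may later clear once usage drops below the limit again.
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  options.sst_file_manager->SetMaxAllowedSpaceUsage(64ULL << 30);  // 64 GB

  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, path, &db);
  delete db;
}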
From e264eebcd7f5880093b42f13a44c7e67d1619969 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 24 May 2019 20:28:52 -0700 Subject: [PATCH 073/572] Add comments in file_reader_writer.h (#5355) Summary: Add file and class level comments in file_reader_writer.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5355 Differential Revision: D15499020 Pulled By: sagar0 fbshipit-source-id: 925b2326885cdb4357e6a139ac65ee5e2ce1d613 --- util/file_reader_writer.h | 83 +++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 4451f8b81bf..1ef23e8c936 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. + #pragma once #include #include @@ -22,9 +23,22 @@ namespace rocksdb { class Statistics; class HistogramImpl; +// This file provides the following main abstractions: +// SequentialFileReader : wrapper over Env::SequentialFile +// RandomAccessFileReader : wrapper over Env::RandomAccessFile +// WritableFileWriter : wrapper over Env::WritableFile +// In addition, it also exposes NewReadaheadRandomAccessFile, NewWritableFile, +// and ReadOneLine primitives. + +// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to +// always prefetch additional data with every read. This is mainly used in +// Compaction Table Readers. std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size); +// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles +// Buffered (i.e. when page cache is enabled) and Direct (with O_DIRECT / page +// cache disabled) reads appropriately, and also updates the IO stats. class SequentialFileReader { private: std::unique_ptr<SequentialFile> file_; @@ -61,6 +75,12 @@ class SequentialFileReader { bool use_direct_io() const { return file_->use_direct_io(); } }; +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is +// responsible for: +// - Handling Buffered and Direct reads appropriately. +// - Rate limiting compaction reads. +// - Notifying any interested listeners on the completion of a read. +// - Updating IO stats. class RandomAccessFileReader { private: #ifndef ROCKSDB_LITE @@ -151,7 +171,13 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } }; -// Use posix write to write data to a file. +// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides +// facilities to: +// - Handle Buffered and Direct writes. +// - Rate limit writes. +// - Flush and Sync the data to the underlying filesystem. +// - Notify any interested listeners on the completion of a write. +// - Update IO stats. class WritableFileWriter { private: #ifndef ROCKSDB_LITE @@ -277,13 +303,31 @@ class WritableFileWriter { Status SyncInternal(bool use_fsync); }; -// FilePrefetchBuffer can automatically do the readahead if file_reader, -// readahead_size, and max_readahead_size are passed in. -// max_readahead_size should be greater than or equal to readahead_size. -// readahead_size will be doubled on every IO, until max_readahead_size. +// FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: - // If `track_min_offset` is true, track minimum offset ever read. + // Constructor. + // + // All arguments are optional. + // file_reader : the file reader to use. Can be a nullptr. + // readahead_size : the initial readahead size. + // max_readahead_size : the maximum readahead size. + // If max_readahead_size > readahead_size, the readahead size will be + // doubled on every IO until max_readahead_size is hit. + // Typically this is set as a multiple of readahead_size. + // max_readahead_size should be greater than or equal to readahead_size. + // enable : controls whether reading from the buffer is enabled. + // If false, TryReadFromCache() always returns false, and we only take stats + // for the minimum offset if track_min_offset = true. + // track_min_offset : Track the minimum offset ever read and collect stats on + // it. Used for adaptable readahead of the file footer/metadata. + // + // Automatic readahead is enabled for a file if file_reader, readahead_size, + // and max_readahead_size are passed in. + // If file_reader is a nullptr, setting readadhead_size and max_readahead_size + // does not make any sense. So it does nothing. + // A user can construct a FilePrefetchBuffer without any arguments, but use + // `Prefetch` to load data into the buffer. FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, size_t readadhead_size = 0, size_t max_readahead_size = 0, bool enable = true, bool track_min_offset = false) @@ -294,11 +338,26 @@ class FilePrefetchBuffer { min_offset_read_(port::kMaxSizet), enable_(enable), track_min_offset_(track_min_offset) {} + + // Load data into the buffer from a file. + // reader : the file reader. + // offset : the file offset to start reading from. + // n : the number of bytes to read. Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n); + + // Tries returning the data for a file read from this buffer, if that data is + // in the buffer. + // It handles tracking the minimum read offset if track_min_offset = true. + // It also does the exponential readahead when readadhead_size is set as part + // of the constructor. + // + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. bool TryReadFromCache(uint64_t offset, size_t n, Slice* result); - // The minimum `offset` ever passed to TryReadFromCache(). Only be tracked - // if track_min_offset = true. + // The minimum `offset` ever passed to TryReadFromCache(). This will only be + // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } private: @@ -317,9 +376,17 @@ class FilePrefetchBuffer { bool track_min_offset_; }; +// Returns a WritableFile. +// +// env : the Env. +// fname : the file name. +// result : output arg. A WritableFile based on `fname` returned. +// options : the Env Options. extern Status NewWritableFile(Env* env, const std::string& fname, std::unique_ptr<WritableFile>* result, const EnvOptions& options); + +// Read a single line from a file. bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, std::string* output, bool* has_data, Status* result);
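FilePrefetchBuffer is an internal helper, not a public API, but the constructor contract documented above suggests a usage sketch like the following (the manual no-argument + Prefetch mode; reader creation is elided and assumed to happen elsewhere):

#include "util/file_reader_writer.h"  // internal header

// Load a region once, then serve reads from the buffer, as the footer
// reading code does. With the default constructor there is no automatic
// readahead; Prefetch() must populate the buffer explicitly.
void ReadRegionWithPrefetch(rocksdb::RandomAccessFileReader* reader,
                            uint64_t offset, size_t n) {
  rocksdb::FilePrefetchBuffer prefetch_buffer;
  if (prefetch_buffer.Prefetch(reader, offset, n).ok()) {
    rocksdb::Slice result;
    // Served from the buffer, since [offset, offset + n) was just loaded.
    prefetch_buffer.TryReadFromCache(offset, n, &result);
  }
}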
From b5e4ee2e763789e23ee2e31e8fc8f82916bafc2d Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 28 May 2019 12:16:22 -0700 Subject: [PATCH 074/572] Fix a clang analyze error (#5365) Summary: The analyzer thinks max_allowed_space_ can be 0. In that case, free_space will be assigned as free_space. It fails to realize that the function call GetFreeSpace actually sets the free_space variable properly, which is possibly due to lack of inter-function call analysis. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5365 Differential Revision: D15521790 Pulled By: riversand963 fbshipit-source-id: 839d0a285a1c8773a28a385f0c3be4bb7fbe32cb --- util/sst_file_manager_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 047b0c093d6..d85b9c960de 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -264,7 +264,7 @@ void SstFileManagerImpl::ClearError() { return; } - uint64_t free_space; + uint64_t free_space = 0; Status s = env_->GetFreeSpace(path_, &free_space); free_space = max_allowed_space_ > 0 ? std::min(max_allowed_space_, free_space) From 4d0c3b1f9644ae5b6a13740075e259268eff40df Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 28 May 2019 12:18:31 -0700 Subject: [PATCH 075/572] Add comments in compaction_picker.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5357 Differential Revision: D15522825 Pulled By: siying fbshipit-source-id: d775386b9d10c7179f5d3af2c821ed213abfacdf --- db/compaction_picker.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 250566b1065..05895a26753 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -24,11 +24,26 @@ namespace rocksdb { +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + class LogBuffer; class Compaction; class VersionStorageInfo; struct CompactionInputFiles; +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implements the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. class CompactionPicker { public: CompactionPicker(const ImmutableCFOptions& ioptions, @@ -221,6 +236,9 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; +// Picks compactions for leveled compaction. See the wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for a description of leveled compaction. class LevelCompactionPicker : public CompactionPicker { public: LevelCompactionPicker(const ImmutableCFOptions& ioptions, @@ -236,6 +254,8 @@ class LevelCompactionPicker : public CompactionPicker { }; #ifndef ROCKSDB_LITE +// A dummy compaction picker that never triggers any automatic +// compaction. class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions,
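The pickers above are selected indirectly through public compaction options; for instance, a sketch of opting into leveled compaction (the values are illustrative, not prescriptions):

#include "rocksdb/options.h"

rocksdb::Options MakeLeveledOptions() {
  rocksdb::Options options;
  // kCompactionStyleLevel routes picking to LevelCompactionPicker.
  options.compaction_style = rocksdb::kCompactionStyleLevel;
  options.level0_file_num_compaction_trigger = 4;  // L0->L1 trigger
  options.max_bytes_for_level_base = 256 << 20;    // target size of L1
  options.max_bytes_for_level_multiplier = 10;     // growth per level
  return options;
}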
From f5576c33173f3ef27fe9ba1d71beeb6f1aa15c6a Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Tue, 28 May 2019 14:18:24 -0700 Subject: [PATCH 076/572] WritePrepared: disableWAL in commit without prepare (#5327) Summary: When committing a transaction without prepare, WritePrepared simply writes the batch to the DB and adds the commit entry to CommitCache. When two_write_queues=true, following the rule of committing only from the 2nd write queue, the first write writes the batch, and the only thing the 2nd write does is write the commit entry to CommitCache. Currently the write batch in the 2nd write is set to an empty LogData entry, while the write to the WAL could simply be entirely disabled. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5327 Differential Revision: D15424546 Pulled By: maysamyabandeh fbshipit-source-id: 3d9ea3922d5196984c584d62a3ed57e1f7ca7b9f --- .../transactions/pessimistic_transaction_db.cc | 2 +- utilities/transactions/transaction_test.h | 5 +++++ .../write_prepared_transaction_test.cc | 7 ++++--- utilities/transactions/write_prepared_txn_db.cc | 14 +++----------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index c4e6e247756..7b1b0241c97 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -235,7 +235,7 @@ Status TransactionDB::Open( if (txn_db_options.write_policy == WRITE_PREPARED && db_options.unordered_write && !db_options.two_write_queues) { return Status::NotSupported( - "WRITE_UNPREPARED is incompatible with unordered_writes if " + "WRITE_PREPARED is incompatible with unordered_writes if " "two_write_queues is not enabled."); } diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 00fa6cf0364..2e3b9952709 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -214,6 +214,8 @@ class TransactionTestBase : public ::testing::Test { std::atomic<size_t> exp_seq = {0}; std::atomic<size_t> commit_writes = {0}; std::atomic<size_t> expected_commits = {0}; + // Without Prepare, the commit does not write to WAL + std::atomic<size_t> with_empty_commits = {0}; std::function<void(size_t, Status)> txn_t0_with_status = [&](size_t index, Status exp_s) { // Test DB's internal txn. It involves no prepare phase nor a commit marker. @@ -231,6 +233,7 @@ exp_seq++; } } + with_empty_commits++; }; std::function<void(size_t)> txn_t0 = [&](size_t index) { return txn_t0_with_status(index, Status::OK()); @@ -257,6 +260,7 @@ } } ASSERT_OK(s); + with_empty_commits++; }; std::function<void(size_t)> txn_t2 = [&](size_t index) { // Commit without prepare. It should write to DB without a commit marker. @@ -282,6 +286,7 @@ } } delete txn; + with_empty_commits++; }; std::function<void(size_t)> txn_t3 = [&](size_t index) { // A full 2pc txn that also involves a commit marker.
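To make the commit-without-prepare path exercised by these tests concrete, a hedged sketch against the public TransactionDB API (setup and error handling elided); note there is no Prepare() call, hence no commit marker:

#include <cassert>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Commit without prepare under WritePrepared: the batch is written
// directly and only a commit entry is added to the CommitCache.
void CommitWithoutPrepare(rocksdb::TransactionDB* txn_db) {
  rocksdb::WriteOptions write_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);
  txn->Put("key", "value");
  rocksdb::Status s = txn->Commit();  // no Prepare() beforehand
  assert(s.ok());
  delete txn;
}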
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index b93f1a74ffe..7b5a585df91 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1396,6 +1396,7 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { } DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); auto seq = db_impl->TEST_GetLastVisibleSequence(); + with_empty_commits = 0; exp_seq = seq; // This is increased before writing the batch for commit commit_writes = 0; @@ -1487,12 +1488,12 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { assert(db != nullptr); db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); seq = db_impl->TEST_GetLastVisibleSequence(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); // Check if flush preserves the last sequence number db_impl->Flush(fopt); seq = db_impl->GetLatestSequenceNumber(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); // Check if recovery after flush preserves the last sequence number db_impl->FlushWAL(true); @@ -1500,7 +1501,7 @@ assert(db != nullptr); db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); seq = db_impl->GetLatestSequenceNumber(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); } } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 3b09cbbf7d6..6b6831fd83b 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -151,11 +151,6 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; WriteOptions write_options(write_options_orig); - bool sync = write_options.sync; - if (!do_one_write) { - // No need to sync on the first write - write_options.sync = false; - } // In the absence of Prepare markers, use Noop as a batch separator WriteBatchInternal::InsertNoop(batch); const bool DISABLE_MEMTABLE = true; @@ -192,8 +187,6 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, if (do_one_write) { return s; } // else do the 2nd write for commit - // Set the original value of sync - write_options.sync = sync; ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, "CommitBatchInternal 2nd write prepare_seq: %" PRIu64, prepare_seq); @@ -203,10 +196,9 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare( this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS); WriteBatch empty_batch; - empty_batch.PutLogData(Slice()); - const size_t ONE_BATCH = 1; - // In the absence of Prepare markers, use Noop as a batch separator - WriteBatchInternal::InsertNoop(&empty_batch); + write_options.disableWAL = true; + write_options.sync = false; + const size_t ONE_BATCH = 1; // Just to inc the seq s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); From eab4f49a2cba969af04a502e561653ca018dba97 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Tue, 28 May 2019 16:26:14 -0700 Subject: [PATCH 077/572] WritePrepared: skip_concurrency_control option (#5330) Summary: This enables the user to set
TransactionDBOptions::skip_concurrency_control so that the standard `DB::Write(const WriteOptions& opts, WriteBatch* updates)` skips concurrency control. This gives higher throughput to users who know their use case doesn't need concurrency control. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5330 Differential Revision: D15525932 Pulled By: maysamyabandeh fbshipit-source-id: 68421ac1ba34f549a4a8de9ce4c2dccf6fb4b06b --- include/rocksdb/utilities/transaction_db.h | 7 +++++ tools/db_bench_tool.cc | 10 +++++++ .../pessimistic_transaction_db.cc | 27 +++++++-------------- .../transactions/pessimistic_transaction_db.h | 22 +++++++++++++++ .../transactions/write_prepared_txn_db.cc | 14 +++++++++- .../transactions/write_prepared_txn_db.h | 3 +++ 6 files changed, 65 insertions(+), 18 deletions(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 6c4346ff3e7..db32ba0bc3a 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -94,6 +94,13 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjunction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
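A sketch of the configuration this option is meant for, mirroring the db_bench wiring in the next hunk; the path is a placeholder and error handling is elided:

#include <string>
#include "rocksdb/utilities/transaction_db.h"

// Use TransactionDB purely for write ordering, skipping per-key locking.
// Pairs unordered_write with WRITE_PREPARED + two_write_queues, as the
// option comment above describes.
rocksdb::TransactionDB* OpenOrderingOnlyTxnDB(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;
  options.two_write_queues = true;

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::WRITE_PREPARED;
  txn_db_options.skip_concurrency_control = true;

  rocksdb::TransactionDB* txn_db = nullptr;
  rocksdb::TransactionDB::Open(options, txn_db_options, path, &txn_db);
  return txn_db;  // DB::Write() on this handle now skips row locks
}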
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 18d8733439b..2ceca4fd950 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -3788,6 +3788,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { } else if (FLAGS_transaction_db) { TransactionDB* ptr; TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } s = TransactionDB::Open(options, txn_db_options, db_name, column_families, &db->cfh, &ptr); if (s.ok()) { @@ -3814,6 +3819,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { } else if (FLAGS_transaction_db) { TransactionDB* ptr = nullptr; TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } s = CreateLoggerFromOptions(db_name, options, &options.info_log); if (s.ok()) { s = TransactionDB::Open(options, txn_db_options, db_name, &ptr); diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 7b1b0241c97..c1b37c148f5 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -522,23 +522,16 @@ Status PessimisticTransactionDB::Merge(const WriteOptions& options, Status PessimisticTransactionDB::Write(const WriteOptions& opts, WriteBatch* updates) { - // Need to lock all keys in this batch to prevent write conflicts with - // concurrent transactions. - Transaction* txn = BeginInternalTransaction(opts); - txn->DisableIndexing(); - - auto txn_impl = - static_cast_with_check<PessimisticTransaction, Transaction>(txn); - - // Since commitBatch sorts the keys before locking, concurrent Write() - // operations will not cause a deadlock. - // In order to avoid a deadlock with a concurrent Transaction, Transactions - // should use a lock timeout. - Status s = txn_impl->CommitBatch(updates); - - delete txn; + return WriteWithConcurrencyControl(opts, updates); +} +Status WriteCommittedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } } Status WriteCommittedTxnDB::Write( @@ -547,7 +540,7 @@ Status WriteCommittedTxnDB::Write( if (optimizations.skip_concurrency_control) { return db_impl_->Write(opts, updates); } else { - return Write(opts, updates); + return WriteWithConcurrencyControl(opts, updates); } } diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index e80b28852e7..5242c6260b1 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -19,6 +19,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_lock_mgr.h" #include "utilities/transactions/write_prepared_txn.h" @@ -67,6 +68,26 @@ class PessimisticTransactionDB : public TransactionDB { using TransactionDB::Write; virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + inline Status WriteWithConcurrencyControl(const WriteOptions& opts, + WriteBatch* updates) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + txn->DisableIndexing(); + + auto txn_impl = + static_cast_with_check<PessimisticTransaction, Transaction>(txn); + + // Since commitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. + // In order to avoid a deadlock with a concurrent Transaction, Transactions + // should use a lock timeout.
+ Status s = txn_impl->CommitBatch(updates); + + delete txn; + + return s; + } using StackableDB::CreateColumnFamily; virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, @@ -191,6 +212,7 @@ class WriteCommittedTxnDB : public PessimisticTransactionDB { virtual Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) override; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; }; } // namespace rocksdb diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 6b6831fd83b..5250f3f2de5 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -108,6 +108,18 @@ Transaction* WritePreparedTxnDB::BeginTransaction( } } +Status WritePreparedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + // Skip locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN); + } else { + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + Status WritePreparedTxnDB::Write( const WriteOptions& opts, const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { @@ -123,7 +135,7 @@ Status WritePreparedTxnDB::Write( } else { // TODO(myabandeh): Make use of skip_duplicate_key_check hint // Fall back to unoptimized version - return PessimisticTransactionDB::Write(opts, updates); + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); } } diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 25b9b9a1b05..ffdf2f29d8f 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -72,6 +72,9 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const TransactionOptions& txn_options, Transaction* old_txn) override; + using TransactionDB::Write; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; + // Optimized version of ::Write that receives more optimization requests such // as skip_concurrency_control. using PessimisticTransactionDB::Write; From 545d20604084993174f1c0680deeff33bc67a553 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Wed, 29 May 2019 20:44:08 -0700 Subject: [PATCH 078/572] Move some file related files outside util/ (#5375) Summary: util/ is meant for lower-level libraries, so it's a good idea to move the files that require knowledge of the DB out of it. Create a file/ directory and move some files there.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5375 Differential Revision: D15550935 Pulled By: siying fbshipit-source-id: 61a9715dcde5386eebfb43e93f847bba1ae0d3f2 --- CMakeLists.txt | 10 +++++----- Makefile | 2 +- TARGETS | 10 +++++----- db/builder.cc | 2 +- db/column_family.cc | 2 +- db/compaction_job.cc | 4 ++-- db/compaction_job_stats_test.cc | 2 +- db/compaction_picker.cc | 2 +- db/compaction_picker_universal.cc | 2 +- db/corruption_test.cc | 2 +- db/db_filesnapshot.cc | 4 ++-- db/db_impl.cc | 6 +++--- db/db_impl_compaction_flush.cc | 2 +- db/db_impl_files.cc | 4 ++-- db/db_impl_open.cc | 2 +- db/db_info_dumper.cc | 2 +- db/db_iter.cc | 2 +- db/db_sst_test.cc | 2 +- db/db_test.cc | 2 +- db/db_test_util.h | 2 +- db/deletefile_test.cc | 2 +- db/error_handler.cc | 2 +- db/external_sst_file_ingestion_job.cc | 2 +- db/external_sst_file_test.cc | 2 +- db/fault_injection_test.cc | 2 +- db/filename_test.cc | 2 +- db/flush_job.cc | 4 ++-- db/listener_test.cc | 2 +- db/memtable_list.h | 2 +- db/obsolete_files_test.cc | 2 +- db/plain_table_db_test.cc | 2 +- db/repair.cc | 2 +- db/repair_test.cc | 2 +- db/table_cache.cc | 2 +- db/transaction_log_impl.h | 2 +- db/version_set.cc | 2 +- db/wal_manager.cc | 4 ++-- {util => file}/delete_scheduler.cc | 4 ++-- {util => file}/delete_scheduler.h | 0 {util => file}/delete_scheduler_test.cc | 4 ++-- {util => file}/file_util.cc | 4 ++-- {util => file}/file_util.h | 2 +- {util => file}/filename.cc | 2 +- {util => file}/filename.h | 0 {util => file}/sst_file_manager_impl.cc | 2 +- {util => file}/sst_file_manager_impl.h | 2 +- src.mk | 8 ++++---- tools/ldb_cmd.cc | 2 +- tools/write_stress.cc | 2 +- util/auto_roll_logger.h | 2 +- util/fault_injection_test_env.h | 2 +- utilities/backupable/backupable_db.cc | 2 +- utilities/backupable/backupable_db_test.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 6 +++--- utilities/blob_db/blob_db_impl_filesnapshot.cc | 2 +- utilities/blob_db/blob_db_test.cc | 4 ++-- utilities/blob_db/blob_file.cc | 2 +- utilities/checkpoint/checkpoint_impl.cc | 4 ++-- utilities/checkpoint/checkpoint_impl.h | 2 +- utilities/convenience/info_log_finder.cc | 2 +- utilities/options/options_util.cc | 2 +- utilities/ttl/db_ttl_impl.cc | 2 +- 62 files changed, 85 insertions(+), 85 deletions(-) rename {util => file}/delete_scheduler.cc (99%) rename {util => file}/delete_scheduler.h (100%) rename {util => file}/delete_scheduler_test.cc (99%) rename {util => file}/file_util.cc (97%) rename {util => file}/file_util.h (97%) rename {util => file}/filename.cc (99%) rename {util => file}/filename.h (100%) rename {util => file}/sst_file_manager_impl.cc (99%) rename {util => file}/sst_file_manager_impl.h (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb0c089f2e..4d74152d9d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -538,6 +538,10 @@ set(SOURCES env/env_encryption.cc env/env_hdfs.cc env/mock_env.cc + file/delete_scheduler.cc + file/file_util.cc + file/filename.cc + file/sst_file_manager_impl.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -612,12 +616,9 @@ set(SOURCES util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc - util/delete_scheduler.cc util/dynamic_bloom.cc util/event_logger.cc util/file_reader_writer.cc - util/file_util.cc - util/filename.cc util/filter_policy.cc util/hash.cc util/jemalloc_nodump_allocator.cc @@ -626,7 +627,6 @@ set(SOURCES util/random.cc util/rate_limiter.cc util/slice.cc - util/sst_file_manager_impl.cc util/status.cc 
util/string_util.cc util/sync_point.cc @@ -931,6 +931,7 @@ if(WITH_TESTS) env/env_basic_test.cc env/env_test.cc env/mock_env_test.cc + file/delete_scheduler_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -959,7 +960,6 @@ if(WITH_TESTS) util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc - util/delete_scheduler_test.cc util/dynamic_bloom_test.cc util/event_logger_test.cc util/file_reader_writer_test.cc diff --git a/Makefile b/Makefile index ee20a41bb1a..ec0a04ed106 100644 --- a/Makefile +++ b/Makefile @@ -1369,7 +1369,7 @@ fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index 073c977e5ad..7d271515728 100644 --- a/TARGETS +++ b/TARGETS @@ -143,6 +143,10 @@ cpp_library( "env/env_posix.cc", "env/io_posix.cc", "env/mock_env.cc", + "file/delete_scheduler.cc", + "file/file_util.cc", + "file/filename.cc", + "file/sst_file_manager_impl.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -218,12 +222,9 @@ cpp_library( "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", - "util/delete_scheduler.cc", "util/dynamic_bloom.cc", "util/event_logger.cc", "util/file_reader_writer.cc", - "util/file_util.cc", - "util/filename.cc", "util/filter_policy.cc", "util/hash.cc", "util/jemalloc_nodump_allocator.cc", @@ -232,7 +233,6 @@ cpp_library( "util/random.cc", "util/rate_limiter.cc", "util/slice.cc", - "util/sst_file_manager_impl.cc", "util/status.cc", "util/string_util.cc", "util/sync_point.cc", @@ -663,7 +663,7 @@ ROCKS_TESTS = [ ], [ "delete_scheduler_test", - "util/delete_scheduler_test.cc", + "file/delete_scheduler_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 7f2fd72a191..b42ac187ef0 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -21,6 +21,7 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/filename.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" @@ -32,7 +33,6 @@ #include "table/format.h" #include "table/internal_iterator.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/sync_point.h" diff --git a/db/column_family.cc b/db/column_family.cc index 4592c945f2e..325610b8844 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -29,6 +29,7 @@ #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" @@ -36,7 +37,6 @@ #include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 44fb385d1b3..7d2015e5629 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -38,6 +38,8 @@ #include "db/merge_helper.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" +#include 
"file/filename.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -53,12 +55,10 @@ #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 48e883bc6cc..5ca6bf4a337 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -27,6 +27,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -52,7 +53,6 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 4bd8ff0e33a..f500def41ee 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -20,8 +20,8 @@ #include #include #include "db/column_family.h" +#include "file/filename.h" #include "monitoring/statistics.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index 9291178585a..c25ae94fa1b 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -20,8 +20,8 @@ #include #include #include "db/column_family.h" +#include "file/filename.h" #include "monitoring/statistics.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 1ccb1aa2b09..ba97ca1502b 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -19,6 +19,7 @@ #include "db/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" @@ -26,7 +27,6 @@ #include "rocksdb/write_batch.h" #include "table/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ace0befb6d5..5b630e21635 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -17,11 +17,11 @@ #include "db/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index 3ec9e2ab2d6..e7ed1866469 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -53,6 +53,9 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sst_file_manager_impl.h" #include 
"memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/iostats_context_imp.h" @@ -89,12 +92,9 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 38c69dfc1e4..1e39bdd4271 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -16,12 +16,12 @@ #include "db/builder.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl_files.cc b/db/db_impl_files.cc index b16cf87947d..64c6dc96879 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl_files.cc @@ -16,8 +16,8 @@ #include #include "db/event_helpers.h" #include "db/memtable_list.h" -#include "util/file_util.h" -#include "util/sst_file_manager_impl.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 66104d0ba28..4240b2012dc 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -15,11 +15,11 @@ #include "db/builder.h" #include "db/error_handler.h" +#include "file/sst_file_manager_impl.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based_table_factory.h" #include "util/rate_limiter.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 31050d20a29..be85357c2e1 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -15,8 +15,8 @@ #include #include +#include "file/filename.h" #include "rocksdb/env.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index a606e3acd66..8fc17e1446e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -16,6 +16,7 @@ #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "file/filename.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -24,7 +25,6 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "util/arena.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 9003ed6b1ac..815aed23e0e 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 8a112e48fcd..7864a7e2c65 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -31,6 +31,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" @@ -59,7 +60,6 @@ #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index f5d7fd1a75f..81186bfb9ad 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -27,6 +27,7 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -45,7 +46,6 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/filename.h" #include "util/mock_time_env.h" #include "util/mutexlock.h" diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 54bab847927..81ff8d0b99f 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -16,10 +16,10 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index afec14edcbe..140fb4850f6 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -6,7 +6,7 @@ #include "db/error_handler.h" #include "db/db_impl.h" #include "db/event_helpers.h" -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 588ac5110a2..7bfc64f77cb 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -17,12 +17,12 @@ #include #include "db/version_edit.h" +#include "file/file_util.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" #include "util/stop_watch.h" #include "util/sync_point.h" diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 3850a2a031e..0a0994f0ea9 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -7,11 +7,11 @@ #include #include "db/db_test_util.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" #include "util/fault_injection_test_env.h" -#include "util/filename.h" #include "util/testutil.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 53de312c017..1bfaa299456 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -15,13 +15,13 @@ #include "db/log_format.h" #include "db/version_set.h" 
#include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "util/fault_injection_test_env.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/db/filename_test.cc b/db/filename_test.cc index d6bde52834e..869469f3f0c 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/filename.h" +#include "file/filename.h" #include "db/dbformat.h" #include "port/port.h" diff --git a/db/flush_job.cc b/db/flush_job.cc index 21c1ff3a746..46915ca13a8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -29,6 +29,8 @@ #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -45,8 +47,6 @@ #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/event_logger.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 56968d8f803..6b716a1d4b1 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -8,6 +8,7 @@ #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -23,7 +24,6 @@ #include "rocksdb/table_properties.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/memtable_list.h b/db/memtable_list.h index 5df35660a4d..a5f0c123292 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -16,13 +16,13 @@ #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" +#include "file/filename.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/filename.h" #include "util/log_buffer.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 52175a07b74..6bf2acf8519 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -16,10 +16,10 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 8a08cf9fede..ef770c2e50b 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -15,6 +15,7 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -28,7 +29,6 @@ #include 
"table/plain_table_key_coding.h" #include "table/plain_table_reader.h" #include "table/table_builder.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/repair.cc b/db/repair.cc index 2715adcf129..577c122bcf9 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -74,6 +74,7 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -82,7 +83,6 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/repair_test.cc b/db/repair_test.cc index 3422532da4b..1851cde0dfc 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -11,10 +11,10 @@ #include "db/db_impl.h" #include "db/db_test_util.h" +#include "file/file_util.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/transaction_log.h" -#include "util/file_util.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 06255d6a354..01724dfc5cb 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -12,7 +12,7 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" -#include "util/filename.h" +#include "file/filename.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6382b61a5b7..68ba620714c 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -9,13 +9,13 @@ #include "db/log_reader.h" #include "db/version_set.h" +#include "file/filename.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index 5723c6d9253..c10eb9f7ac3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -33,6 +33,7 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "file/filename.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" @@ -49,7 +50,6 @@ #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 62511819e4d..cce714750e7 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -22,6 +22,8 @@ #include "db/log_writer.h" #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -29,8 +31,6 @@ #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/util/delete_scheduler.cc b/file/delete_scheduler.cc similarity index 99% rename from util/delete_scheduler.cc rename to file/delete_scheduler.cc index f5ee2844896..41ec84376b6 100644 --- a/util/delete_scheduler.cc +++ 
b/file/delete_scheduler.cc @@ -5,16 +5,16 @@ #ifndef ROCKSDB_LITE -#include "util/delete_scheduler.h" +#include "file/delete_scheduler.h" #include #include +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/util/delete_scheduler.h b/file/delete_scheduler.h similarity index 100% rename from util/delete_scheduler.h rename to file/delete_scheduler.h diff --git a/util/delete_scheduler_test.cc b/file/delete_scheduler_test.cc similarity index 99% rename from util/delete_scheduler_test.cc rename to file/delete_scheduler_test.cc index 0d8e354b9c0..c8544004cd5 100644 --- a/util/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -12,10 +12,10 @@ #include #include +#include "file/delete_scheduler.h" +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/delete_scheduler.h" -#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/util/file_util.cc b/file/file_util.cc similarity index 97% rename from util/file_util.cc rename to file/file_util.cc index ba1b4744bbb..0364f834022 100644 --- a/util/file_util.cc +++ b/file/file_util.cc @@ -3,13 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "util/file_util.h" +#include "file/file_util.h" #include #include +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" -#include "util/sst_file_manager_impl.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/util/file_util.h b/file/file_util.h similarity index 97% rename from util/file_util.h rename to file/file_util.h index c3b365c8bc3..9116c1fecfb 100644 --- a/util/file_util.h +++ b/file/file_util.h @@ -6,11 +6,11 @@ #pragma once #include +#include "file/filename.h" #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { // use_fsync maps to options.use_fsync, which determines the way that diff --git a/util/filename.cc b/file/filename.cc similarity index 99% rename from util/filename.cc rename to file/filename.cc index 32289aecb4b..0a48dc78c36 100644 --- a/util/filename.cc +++ b/file/filename.cc @@ -10,7 +10,7 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/filename.h" +#include "file/filename.h" #include #include diff --git a/util/filename.h b/file/filename.h similarity index 100% rename from util/filename.h rename to file/filename.h diff --git a/util/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc similarity index 99% rename from util/sst_file_manager_impl.cc rename to file/sst_file_manager_impl.cc index d85b9c960de..86bcb2d19ca 100644 --- a/util/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/util/sst_file_manager_impl.h b/file/sst_file_manager_impl.h similarity index 99% rename from util/sst_file_manager_impl.h rename to file/sst_file_manager_impl.h index 211b4fa7160..b506ece2796 100644 --- a/util/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -13,8 +13,8 @@ #include "db/compaction.h" #include "db/error_handler.h" +#include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" -#include "util/delete_scheduler.h" namespace rocksdb { diff --git a/src.mk b/src.mk index e3fe5632f87..2541b9fd12b 100644 --- a/src.mk +++ b/src.mk @@ -67,6 +67,10 @@ LIB_SOURCES = \ env/env_posix.cc \ env/io_posix.cc \ env/mock_env.cc \ + file/delete_scheduler.cc \ + file/file_util.cc \ + file/filename.cc \ + file/sst_file_manager_impl.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -139,12 +143,9 @@ LIB_SOURCES = \ util/concurrent_arena.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ - util/delete_scheduler.cc \ util/dynamic_bloom.cc \ util/event_logger.cc \ util/file_reader_writer.cc \ - util/file_util.cc \ - util/filename.cc \ util/filter_policy.cc \ util/hash.cc \ util/jemalloc_nodump_allocator.cc \ @@ -153,7 +154,6 @@ LIB_SOURCES = \ util/random.cc \ util/rate_limiter.cc \ util/slice.cc \ - util/sst_file_manager_impl.cc \ util/status.cc \ util/string_util.cc \ util/sync_point.cc \ diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index d05ae4a5810..10e9a495d23 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "db/log_reader.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "port/port_dirent.h" #include "rocksdb/cache.h" #include "rocksdb/table_properties.h" @@ -31,7 +32,6 @@ #include "tools/sst_dump_tool_imp.h" #include "util/cast_util.h" #include "util/coding.h" -#include "util/filename.h" #include "util/stderr_logger.h" #include "util/string_util.h" #include "utilities/ttl/db_ttl_impl.h" diff --git a/tools/write_stress.cc b/tools/write_stress.cc index ddb1d0aed03..8cde31e6b84 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -67,12 +67,12 @@ int main() { #include #include +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "util/filename.h" #include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 64fce4d63e7..24f4714b4fd 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -10,9 +10,9 @@ #include #include +#include "file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/util/fault_injection_test_env.h b/util/fault_injection_test_env.h index a39e5b71e9d..d962acfd585 100644 --- a/util/fault_injection_test_env.h +++ b/util/fault_injection_test_env.h @@ -19,9 +19,9 @@ #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index b7c15c39150..149eb911f7f 100644 --- 
a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -10,6 +10,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/backupable_db.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" @@ -17,7 +18,6 @@ #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/logging.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1548203dd0a..e4abd96e95f 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -14,6 +14,7 @@ #include "db/db_impl.h" #include "env/env_chroot.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/rate_limiter.h" @@ -22,7 +23,6 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/options_util.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stderr_logger.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 5dcddc214c8..9f3839370eb 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -12,6 +12,9 @@ #include "db/db_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics.h" #include "rocksdb/convenience.h" @@ -26,12 +29,9 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/sync_point.h" #include "util/timer_queue.h" diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index 8effe88c0a6..16b9ff826e6 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -7,7 +7,7 @@ #include "utilities/blob_db/blob_db_impl.h" -#include "util/filename.h" +#include "file/filename.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index afb953df9c5..e24ba1d983c 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -14,13 +14,13 @@ #include #include "db/db_test_util.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/utilities/debug.h" #include "util/cast_util.h" #include "util/fault_injection_test_env.h" -#include "util/file_util.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 3bcbd048734..e14307d44cd 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -19,7 +19,7 @@ #include "db/column_family.h" #include "db/db_impl.h" #include "db/dbformat.h" -#include "util/filename.h" +#include "file/filename.h" #include "util/logging.h" #include "utilities/blob_db/blob_db_impl.h" diff --git 
a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index 9863ac1d564..920f9bf535b 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -21,13 +21,13 @@
 #include

 #include "db/wal_manager.h"
+#include "file/file_util.h"
+#include "file/filename.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/utilities/checkpoint.h"
-#include "util/file_util.h"
-#include "util/filename.h"
 #include "util/sync_point.h"

 namespace rocksdb {
diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h
index a85fde59b60..d26a9f66bfc 100644
--- a/utilities/checkpoint/checkpoint_impl.h
+++ b/utilities/checkpoint/checkpoint_impl.h
@@ -9,8 +9,8 @@
 #include "rocksdb/utilities/checkpoint.h"

 #include
+#include "file/filename.h"
 #include "rocksdb/db.h"
-#include "util/filename.h"

 namespace rocksdb {
diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc
index 72c4a6275ae..3e599961630 100644
--- a/utilities/convenience/info_log_finder.cc
+++ b/utilities/convenience/info_log_finder.cc
@@ -8,8 +8,8 @@
 // found in the LICENSE file.

 #include "rocksdb/utilities/info_log_finder.h"
+#include "file/filename.h"
 #include "rocksdb/env.h"
-#include "util/filename.h"

 namespace rocksdb {
diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc
index 3975eadd755..561e925ebbe 100644
--- a/utilities/options/options_util.cc
+++ b/utilities/options/options_util.cc
@@ -7,9 +7,9 @@

 #include "rocksdb/utilities/options_util.h"

+#include "file/filename.h"
 #include "options/options_parser.h"
 #include "rocksdb/options.h"
-#include "util/filename.h"

 namespace rocksdb {
 Status LoadOptionsFromFile(const std::string& file_name, Env* env,
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index 1952e6188d6..47049a13585 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -7,12 +7,12 @@
 #include "utilities/ttl/db_ttl_impl.h"

 #include "db/write_batch_internal.h"
+#include "file/filename.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "util/coding.h"
-#include "util/filename.h"

 namespace rocksdb {

From 87fe4bcab857c38a22ebecfb6e7d0e5a8d9a0864 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Thu, 30 May 2019 10:43:34 -0700
Subject: [PATCH 079/572] Fix FIFO dynamic options sanitization (#5367)

Summary:
When dynamically setting options, we check the option type info and skip
options that are marked deprecated. However, this check is only done at the
top level, which results in bugs where SetOptions will corrupt option values
and cause unexpected system behavior iff a deprecated second-level option is
set dynamically. For example, the following call:
```
dbfull()->SetOptions(
    {{"compaction_options_fifo",
      "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}});
```
dates from before the 6.0 release, when `ttl` was part of
`compaction_options_fifo`. Now that `ttl` has moved out of
`compaction_options_fifo`, this call will incorrectly set
`compaction_options_fifo.max_table_files_size` to 731 (as
`max_table_files_size` is the first entry in the
`OptionsHelper::fifo_compaction_options_type_info` struct) and cause files
to get evicted much faster than expected.

This PR adds verification for second-level options such as
`compaction_options_fifo.ttl` or `compaction_options_fifo.max_table_files_size`
when they are set dynamically, and filters out those marked as deprecated.
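To make the failure mode and the fix concrete, here is a minimal, self-contained sketch of parsing a nested `{k=v;...}` option string while skipping deprecated sub-options. This is not RocksDB's actual parser: `FifoOptions`, `kTypeInfo`, and `ParseStructOption` are invented for illustration, and only the skip-on-deprecated idea mirrors the `ParseSingleStructOption` change in the diff below.
```
// Minimal sketch (not RocksDB's actual parser): parse a "{k=v;...}"
// struct-option string, skipping entries whose type info marks them
// deprecated. FifoOptions, kTypeInfo, and ParseStructOption are all
// invented for illustration.
#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

enum class Verification { kNormal, kDeprecated };

struct FifoOptions {
  uint64_t max_table_files_size = 0;
  bool allow_compaction = false;
};

// "ttl" is still accepted for backward compatibility, but marked
// deprecated so its value is never written into FifoOptions.
const std::map<std::string, Verification> kTypeInfo = {
    {"max_table_files_size", Verification::kNormal},
    {"allow_compaction", Verification::kNormal},
    {"ttl", Verification::kDeprecated},
};

bool ParseStructOption(const std::string& spec, FifoOptions* out) {
  // Strip the surrounding braces, then split on ';'.
  if (spec.size() < 2 || spec.front() != '{' || spec.back() != '}') {
    return false;
  }
  std::istringstream body(spec.substr(1, spec.size() - 2));
  std::string entry;
  while (std::getline(body, entry, ';')) {
    if (entry.empty()) continue;
    const size_t eq = entry.find('=');
    if (eq == std::string::npos) return false;
    const std::string key = entry.substr(0, eq);
    const std::string value = entry.substr(eq + 1);
    auto it = kTypeInfo.find(key);
    if (it == kTypeInfo.end()) return false;  // unknown sub-option
    if (it->second == Verification::kDeprecated) {
      continue;  // accept but ignore, instead of corrupting another field
    }
    if (key == "max_table_files_size") {
      out->max_table_files_size = std::stoull(value);
    } else if (key == "allow_compaction") {
      out->allow_compaction = (value == "true");
    }
  }
  return true;
}

int main() {
  FifoOptions opts;
  // The pre-6.0 style string from the summary: ttl is parsed and dropped,
  // and max_table_files_size keeps its intended value.
  const bool ok = ParseStructOption(
      "{allow_compaction=true;max_table_files_size=1024;ttl=731;}", &opts);
  std::cout << ok << " " << opts.max_table_files_size << " "
            << opts.allow_compaction << "\n";  // prints: 1 1024 1
  return 0;
}
```
If the `kDeprecated` branch were removed, the parser would have to either reject `ttl` or write its value into some field of the struct, which parallels how the real bug wrote 731 over `max_table_files_size`.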
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5367

Differential Revision: D15530998

Pulled By: miasantreble

fbshipit-source-id: 818258be5c3abe09cd82d62f3c083572d70fecdd
---
 db/db_options_test.cc     | 47 +++++++++++++++++++++++++++++++++++++++
 options/options_helper.cc |  5 +++++
 2 files changed, 52 insertions(+)

diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index a7ecf12744b..cb9a0e02e61 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -1007,6 +1007,53 @@ TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
   ASSERT_EQ(256, env_->compaction_readahead_size_);
   Close();
 }
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.create_if_missing = true;
+
+  ASSERT_OK(TryReopen(options));
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+    }
+    Flush();
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // In release 6.0, ttl was promoted from a secondary level option under
+  // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+  // We still need to handle old SetOptions calls but should ignore
+  // ttl under compaction_options_fifo.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+       {"ttl", "60"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+  // Put ttl as the first option inside compaction_options_fifo. That works as
+  // it doesn't overwrite any other option.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+       {"ttl", "191"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
 #endif  // ROCKSDB_LITE
 } // namespace rocksdb

diff --git a/options/options_helper.cc b/options/options_helper.cc
index dbee1636d9f..82e7a1fa13a 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -372,6 +372,11 @@ bool ParseSingleStructOption(
     return false;
   }
   const auto& opt_info = iter->second;
+  if (opt_info.verification == OptionVerificationType::kDeprecated) {
+    // Should also skip deprecated sub-options such as
+    // fifo_compaction_options_type_info.ttl
+    return true;
+  }
   return ParseOptionHelper(
       reinterpret_cast<char*>(options) + opt_info.mutable_offset,
       opt_info.type, value);

From a984040f0bf205cb102cfbc377f8c9e44aff0300 Mon Sep 17 00:00:00 2001
From: anand76
Date: Thu, 30 May 2019 11:08:35 -0700
Subject: [PATCH 080/572] Increase Trash/DB size ratio in DBSSTTest.RateLimitedWALDelete (#5366)

Summary:
By increasing the ratio, we ensure that all files go through background
deletion and eliminate flakiness due to timing of deletions.
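The flakiness being fixed comes from a threshold effect: deletions are rate limited only while the accumulated trash stays under a cap proportional to the DB size, and anything over the cap is deleted immediately. Below is a minimal sketch of that gating idea; it is a simplification under assumed semantics, not `DeleteScheduler`'s actual code, and `ShouldUseTrash` and the numbers are illustrative.
```
// Illustrative sketch of the gating logic the test depends on (a
// simplification, not DeleteScheduler's real code): a deletion only goes
// through the rate-limited trash queue while total trash stays below
// max_trash_db_ratio * total_db_size; otherwise it falls back to an
// immediate, unthrottled delete.
#include <cstdint>
#include <iostream>

bool ShouldUseTrash(uint64_t total_trash_size, uint64_t total_db_size,
                    double max_trash_db_ratio) {
  return static_cast<double>(total_trash_size) <=
         max_trash_db_ratio * static_cast<double>(total_db_size);
}

int main() {
  const uint64_t db_size = 100;  // arbitrary units
  // With ratio 2.1, trash of 250 units is deleted immediately and the
  // test could miss it; with ratio 3.1 it still goes through trash.
  std::cout << ShouldUseTrash(250, db_size, 2.1) << "\n";  // prints: 0
  std::cout << ShouldUseTrash(250, db_size, 3.1) << "\n";  // prints: 1
  return 0;
}
```
Raising the ratio from 2.1 to 3.1 keeps every WAL deletion under the cap, so all of them take the slow, observable background-deletion path.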
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5366

Differential Revision: D15549992

Pulled By: anand1976

fbshipit-source-id: d137375cd791fc1a802841412755d6e2b8fd7688
---
 db/db_sst_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index 815aed23e0e..799d0e14f6b 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -430,6 +430,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) {
   env_->time_elapse_only_sleep_ = true;
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
   options.env = env_;

   int64_t rate_bytes_per_sec = 1024 * 10;  // 10 Kbs / Sec
@@ -439,7 +440,7 @@
   ASSERT_OK(s);
   options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
   auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
-  sfm->delete_scheduler()->SetMaxTrashDBRatio(2.1);
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);

   ASSERT_OK(TryReopen(options));
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();

From e9e0101ca46f00e8a456e69912a913d907be56fc Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Thu, 30 May 2019 11:21:38 -0700
Subject: [PATCH 081/572] Move test related files under util/ to test_util/ (#5377)

Summary:
There are too many types of files under util/. Some test related files don't
belong there or are only loosely related. Move them to a new directory,
test_util/, so that util/ is cleaner.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5377

Differential Revision: D15551366

Pulled By: siying

fbshipit-source-id: 0f5c8653832354ef8caa31749c0143815d719e2c
---
 CMakeLists.txt | 12 ++++++------
 Makefile | 4 ++--
 TARGETS | 14 +++++++-------
 buckifier/buckify_rocksdb.py | 2 +-
 cache/cache_test.cc | 2 +-
 cache/lru_cache_test.cc | 2 +-
 db/builder.cc | 2 +-
 db/column_family_test.cc | 8 ++++----
 db/compact_files_test.cc | 4 ++--
 db/compaction.cc | 2 +-
 db/compaction_iterator.cc | 2 +-
 db/compaction_iterator_test.cc | 4 ++--
 db/compaction_job.cc | 2 +-
 db/compaction_job_stats_test.cc | 6 +++---
 db/compaction_job_test.cc | 4 ++--
 db/compaction_picker.cc | 2 +-
 db/compaction_picker_test.cc | 4 ++--
 db/compaction_picker_universal.cc | 2 +-
 db/comparator_db_test.cc | 4 ++--
 db/corruption_test.cc | 4 ++--
 db/cuckoo_table_db_test.cc | 4 ++--
 db/db_basic_test.cc | 4 ++--
 db/db_compaction_test.cc | 4 ++--
 db/db_encryption_test.cc | 2 +-
 db/db_filesnapshot.cc | 2 +-
 db/db_flush_test.cc | 4 ++--
 db/db_impl.cc | 2 +-
 db/db_impl_compaction_flush.cc | 2 +-
 db/db_impl_open.cc | 2 +-
 db/db_impl_write.cc | 2 +-
 db/db_iter_stress_test.cc | 2 +-
 db/db_iter_test.cc | 4 ++--
 db/db_options_test.cc | 4 ++--
 db/db_range_del_test.cc | 2 +-
 db/db_secondary_test.cc | 4 ++--
 db/db_table_properties_test.cc | 4 ++--
 db/db_test.cc | 6 +++---
 db/db_test_util.h | 8 ++++----
 db/db_universal_compaction_test.cc | 2 +-
 db/db_wal_test.cc | 4 ++--
 db/db_write_test.cc | 4 ++--
 db/dbformat_test.cc | 2 +-
 db/deletefile_test.cc | 6 +++---
 db/error_handler_test.cc | 4 ++--
 db/external_sst_file_basic_test.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 2 +-
 db/external_sst_file_test.cc | 4 ++--
 db/fault_injection_test.cc | 8 ++++----
 db/file_indexer_test.cc | 4 ++--
 db/filename_test.cc | 2 +-
 db/flush_job.cc | 2 +-
 db/flush_job_test.cc | 4 ++--
 db/forward_iterator.cc | 2 +-
 db/forward_iterator_bench.cc | 2 +-
 db/listener_test.cc | 6 +++---
 db/log_test.cc | 4 ++--
 db/manual_compaction_test.cc | 2 +-
 db/memtable_list.cc | 2 +-
 db/memtable_list_test.cc | 4 ++--
 db/merge_helper_test.cc | 4 ++--
 db/merge_test.cc | 2 +-
 db/obsolete_files_test.cc | 6 +++---
 db/options_file_test.cc | 2 +-
 db/perf_context_test.cc | 2 +-
 db/plain_table_db_test.cc | 4 ++--
 db/prefix_test.cc | 2 +-
 db/range_del_aggregator_bench.cc | 2 +-
 db/range_del_aggregator_test.cc | 2 +-
 db/range_tombstone_fragmenter_test.cc | 2 +-
 db/table_cache.cc | 2 +-
 db/table_properties_collector_test.cc | 4 ++--
 db/version_builder_test.cc | 4 ++--
 db/version_edit.cc | 2 +-
 db/version_edit_test.cc | 4 ++--
 db/version_set.cc | 2 +-
 db/version_set_test.cc | 4 ++--
 db/wal_manager.cc | 2 +-
 db/wal_manager_test.cc | 4 ++--
 db/write_batch_test.cc | 2 +-
 db/write_callback_test.cc | 4 ++--
 db/write_controller_test.cc | 2 +-
 db/write_thread.cc | 2 +-
 env/env_basic_test.cc | 2 +-
 env/env_posix.cc | 2 +-
 env/env_test.cc | 6 +++---
 env/io_posix.cc | 2 +-
 env/mock_env_test.cc | 2 +-
 env/posix_logger.h | 2 +-
 file/delete_scheduler.cc | 2 +-
 file/delete_scheduler_test.cc | 6 +++---
 file/filename.cc | 2 +-
 file/sst_file_manager_impl.cc | 2 +-
 java/rocksjni/write_batch_test.cc | 2 +-
 memtable/inlineskiplist_test.cc | 2 +-
 memtable/memtablerep_bench.cc | 2 +-
 memtable/skiplist_test.cc | 2 +-
 memtable/write_buffer_manager_test.cc | 2 +-
 monitoring/histogram_test.cc | 2 +-
 monitoring/instrumented_mutex.cc | 2 +-
 monitoring/iostats_context_test.cc | 2 +-
 monitoring/statistics_test.cc | 4 ++--
 options/options_parser.cc | 2 +-
 options/options_settable_test.cc | 2 +-
 options/options_test.cc | 4 ++--
 port/win/env_default.cc | 2 +-
 port/win/io_win.cc | 2 +-
 src.mk | 12 ++++++------
 table/block.h | 2 +-
 table/block_based_filter_block_test.cc | 4 ++--
 table/block_based_table_reader.cc | 2 +-
 table/block_test.cc | 4 ++--
 table/cleanable_test.cc | 4 ++--
 table/cuckoo_table_builder_test.cc | 4 ++--
 table/cuckoo_table_reader_test.cc | 4 ++--
 table/data_block_hash_index_test.cc | 4 ++--
 table/full_filter_block_test.cc | 4 ++--
 table/merger_test.cc | 4 ++--
 table/merging_iterator.cc | 2 +-
 table/meta_blocks.cc | 2 +-
 table/mock_table.h | 4 ++--
 table/partitioned_filter_block_test.cc | 4 ++--
 table/sst_file_reader_test.cc | 4 ++--
 table/sst_file_writer.cc | 2 +-
 table/table_reader_bench.cc | 4 ++--
 table/table_test.cc | 6 +++---
 {util => test_util}/fault_injection_test_env.cc | 2 +-
 {util => test_util}/fault_injection_test_env.h | 0
 {util => test_util}/mock_time_env.h | 0
 {util => test_util}/sync_point.cc | 4 ++--
 {util => test_util}/sync_point.h | 0
 {util => test_util}/sync_point_impl.cc | 2 +-
 {util => test_util}/sync_point_impl.h | 2 +-
 {util => test_util}/testharness.cc | 2 +-
 {util => test_util}/testharness.h | 0
 {util => test_util}/testutil.cc | 2 +-
 {util => test_util}/testutil.h | 0
 {util => test_util}/transaction_test_util.cc | 2 +-
 {util => test_util}/transaction_test_util.h | 0
 tools/db_bench_tool.cc | 4 ++--
 tools/db_bench_tool_test.cc | 4 ++--
 tools/db_repl_stress.cc | 2 +-
 tools/db_stress.cc | 4 ++--
 tools/ldb_cmd_test.cc | 2 +-
 tools/reduce_levels_test.cc | 4 ++--
 tools/sst_dump_test.cc | 4 ++--
 tools/trace_analyzer_test.cc | 4 ++--
 util/arena.cc | 2 +-
 util/arena_test.cc | 2 +-
 util/auto_roll_logger.h | 2 +-
 util/auto_roll_logger_test.cc | 4 ++--
 util/autovector_test.cc | 4 ++--
 util/bloom_test.cc | 4 ++--
 util/coding_test.cc | 2 +-
 util/crc32c_test.cc | 2 +-
 util/dynamic_bloom_test.cc | 4 ++--
 util/event_logger_test.cc | 2 +-
 util/file_reader_writer.cc | 2 +-
 util/file_reader_writer.h | 2 +-
 util/file_reader_writer_test.cc | 4 ++--
 util/filelock_test.cc | 2 +-
 util/hash_test.cc | 2 +-
 util/log_write_bench.cc | 4 ++--
 util/rate_limiter.cc | 2 +-
 util/rate_limiter_test.cc | 4 ++--
 util/repeatable_thread.h | 2 +-
 util/repeatable_thread_test.cc | 4 ++--
 util/slice_transform_test.cc | 2 +-
 util/thread_list_test.cc | 2 +-
 util/thread_local_test.cc | 6 +++---
 util/timer_queue.h | 2 +-
 utilities/backupable/backupable_db.cc | 2 +-
 utilities/backupable/backupable_db_test.cc | 6 +++---
 utilities/blob_db/blob_db_impl.cc | 2 +-
 utilities/blob_db/blob_db_test.cc | 6 +++---
 utilities/cassandra/cassandra_format_test.cc | 2 +-
 utilities/cassandra/cassandra_functional_test.cc | 2 +-
 utilities/cassandra/cassandra_row_merge_test.cc | 2 +-
 utilities/cassandra/cassandra_serialize_test.cc | 2 +-
 utilities/cassandra/format.h | 2 +-
 utilities/cassandra/test_utils.h | 2 +-
 utilities/checkpoint/checkpoint_impl.cc | 2 +-
 utilities/checkpoint/checkpoint_test.cc | 6 +++---
 utilities/env_librados_test.cc | 2 +-
 utilities/env_mirror_test.cc | 2 +-
 utilities/env_timed_test.cc | 2 +-
 utilities/memory/memory_test.cc | 4 ++--
 .../string_append/stringappend_test.cc | 2 +-
 utilities/object_registry_test.cc | 2 +-
 utilities/options/options_util_test.cc | 4 ++--
 utilities/persistent_cache/block_cache_tier.cc | 2 +-
 utilities/persistent_cache/hash_table_test.cc | 2 +-
 utilities/persistent_cache/persistent_cache_test.h | 2 +-
 .../transactions/optimistic_transaction_test.cc | 4 ++--
 utilities/transactions/pessimistic_transaction.cc | 2 +-
 .../transactions/pessimistic_transaction_db.cc | 2 +-
 utilities/transactions/transaction_lock_mgr.cc | 2 +-
 utilities/transactions/transaction_test.cc | 10 +++++-----
 utilities/transactions/transaction_test.h | 10 +++++-----
 .../write_prepared_transaction_test.cc | 10 +++++-----
 utilities/transactions/write_prepared_txn_db.cc | 2 +-
 utilities/ttl/ttl_test.cc | 2 +-
 utilities/util_merge_operators_test.cc | 4 ++--
 .../write_batch_with_index_test.cc | 2 +-
 203 files changed, 322 insertions(+), 322 deletions(-)
 rename {util => test_util}/fault_injection_test_env.cc (99%)
 rename {util => test_util}/fault_injection_test_env.h (100%)
 rename {util => test_util}/mock_time_env.h (100%)
 rename {util => test_util}/sync_point.cc (95%)
 rename {util => test_util}/sync_point.h (100%)
 rename {util => test_util}/sync_point_impl.cc (98%)
 rename {util => test_util}/sync_point_impl.h (98%)
 rename {util => test_util}/testharness.cc (97%)
 rename {util => test_util}/testharness.h (100%)
 rename {util => test_util}/testutil.cc (99%)
 rename {util => test_util}/testutil.h (100%)
 rename {util => test_util}/transaction_test_util.cc (99%)
 rename {util => test_util}/transaction_test_util.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d74152d9d2..6449047fca6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -600,6 +600,10 @@ set(SOURCES
         table/sst_file_writer.cc
         table/table_properties.cc
         table/two_level_iterator.cc
+        test_util/sync_point.cc
+        test_util/sync_point_impl.cc
+        test_util/testutil.cc
+        test_util/transaction_test_util.cc
        tools/db_bench_tool.cc
        tools/dump/db_dump_tool.cc
        tools/ldb_cmd.cc
@@ -629,13 +633,9 @@ set(SOURCES
        util/slice.cc
        util/status.cc
        util/string_util.cc
-       util/sync_point.cc
-       util/sync_point_impl.cc
-       util/testutil.cc
        util/thread_local.cc
        util/threadpool_imp.cc
        util/trace_replay.cc
-       util/transaction_test_util.cc
        util/xxhash.cc
        utilities/backupable/backupable_db.cc
        utilities/blob_db/blob_compaction_filter.cc
@@ -1006,7 +1006,7 @@ if(WITH_TESTS)
     tools/db_bench.cc
     table/table_reader_bench.cc
     utilities/persistent_cache/hash_table_bench.cc)
-  add_library(testharness
OBJECT util/testharness.cc) + add_library(testharness OBJECT test_util/testharness.cc) foreach(sourcefile ${BENCHMARKS}) get_filename_component(exename ${sourcefile} NAME_WE) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} @@ -1020,7 +1020,7 @@ if(WITH_TESTS) db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) # test utilities are only build in debug diff --git a/Makefile b/Makefile index ec0a04ed106..16d5da0b16c 100644 --- a/Makefile +++ b/Makefile @@ -404,8 +404,8 @@ LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +TESTUTIL = ./test_util/testutil.o +TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) diff --git a/TARGETS b/TARGETS index 7d271515728..c438aa3fb45 100644 --- a/TARGETS +++ b/TARGETS @@ -207,6 +207,9 @@ cpp_library( "table/sst_file_writer.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", @@ -235,12 +238,9 @@ cpp_library( "util/slice.cc", "util/status.cc", "util/string_util.cc", - "util/sync_point.cc", - "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", "util/trace_replay.cc", - "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", @@ -309,10 +309,10 @@ cpp_library( srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "test_util/fault_injection_test_env.cc", + "test_util/testharness.cc", + "test_util/testutil.cc", "tools/trace_analyzer_tool.cc", - "util/fault_injection_test_env.cc", - "util/testharness.cc", - "util/testutil.cc", "utilities/cassandra/test_utils.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, @@ -326,9 +326,9 @@ cpp_library( cpp_library( name = "rocksdb_tools_lib", srcs = [ + "test_util/testutil.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", - "util/testutil.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index a5d71b65d4e..94b63a4e8bf 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -118,7 +118,7 @@ def generate_targets(repo_path): "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []) + - ["util/testutil.cc"], + ["test_util/testutil.cc"], [":rocksdb_lib"]) # test for every test we found in the Makefile diff --git a/cache/cache_test.cc b/cache/cache_test.cc index f9f77234cdb..377ae146876 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -18,7 +18,7 @@ #include "cache/lru_cache.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 9980dd72b7b..575764611ce 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -8,7 +8,7 @@ #include #include #include "port/port.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { 
diff --git a/db/builder.cc b/db/builder.cc index b42ac187ef0..2b97ce1d608 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -34,7 +34,7 @@ #include "table/internal_iterator.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/column_family_test.cc b/db/column_family_test.cc index bdc832bd235..f5d57c35b78 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -21,11 +21,11 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "util/coding.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ce80375e0e1..b97fd064e70 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -15,8 +15,8 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/compaction.cc b/db/compaction.cc index f8805376f1d..00ebd28b087 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -19,7 +19,7 @@ #include "db/column_family.h" #include "rocksdb/compaction_filter.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index ca55eef7123..7e060969962 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -9,7 +9,7 @@ #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ ((seq) <= (snapshot) && \ diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index c466f6c9122..b0a553136a3 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -10,8 +10,8 @@ #include "port/port.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 7d2015e5629..91c7f437a17 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -61,7 +61,7 @@ #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 5ca6bf4a337..91441f5d76a 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -58,9 +58,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 
60394cc9735..4608cceeac1 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -27,8 +27,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index f500def41ee..c01f2884d4c 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -25,7 +25,7 @@ #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index c759dae8b6c..82fc16f4f5a 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -14,8 +14,8 @@ #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index c25ae94fa1b..b8d23795fbc 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -25,7 +25,7 @@ #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index a7ff587949d..ba7042049cb 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -12,8 +12,8 @@ #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/corruption_test.cc b/db/corruption_test.cc index ba97ca1502b..379c33e4599 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -28,8 +28,8 @@ #include "table/block_based_table_builder.h" #include "table/meta_blocks.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2d4487ff454..ecd6d71ca2e 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -12,8 +12,8 @@ #include "table/cuckoo_table_reader.h" #include "table/meta_blocks.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 236a534657f..45524b250f7 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,9 +10,9 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 91a04205e07..623836454db 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -14,8 +14,8 @@ #include "rocksdb/experimental.h" 
#include "rocksdb/utilities/convenience.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 46ba411b6fd..4ddc11986b8 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -7,7 +7,7 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif #include #include diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 5b630e21635..a1a1c8f99d6 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -23,7 +23,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 876605b2e48..b901a5a7805 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -9,8 +9,8 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl.cc b/db/db_impl.cc index e7ed1866469..749bd3629a0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -97,7 +97,7 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 1e39bdd4271..c5cc0736665 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -22,7 +22,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 4240b2012dc..0be85031ba3 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -20,7 +20,7 @@ #include "rocksdb/wal_filter.h" #include "table/block_based_table_factory.h" #include "util/rate_limiter.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 92edc84254c..98463f7b27f 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -16,7 +16,7 @@ #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Convenience methods diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index a0f1dfeab45..8c3588e9abd 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -10,7 +10,7 @@ #include "rocksdb/slice.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 29fbd320861..49e670abc28 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -18,8 +18,8 @@ #include "table/iterator_wrapper.h" #include 
"table/merging_iterator.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index cb9a0e02e61..37a9f1a365b 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -20,8 +20,8 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index aa63286f60a..16d682fc083 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -5,7 +5,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index a4267c7d596..50a0923b4c8 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -10,8 +10,8 @@ #include "db/db_impl_secondary.h" #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 77ea0020dd6..82f106133e8 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -14,8 +14,8 @@ #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/db/db_test.cc b/db/db_test.cc index 7864a7e2c65..66df2323de2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -63,9 +63,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_test_util.h b/db/db_test_util.h index 81186bfb9ad..3bc107889b4 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -46,13 +46,13 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/mock_time_env.h" +#include "test_util/mock_time_env.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 2bd8af684e0..4f1df4a7d57 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -11,7 +11,7 @@ #include "port/stack_trace.h" #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 
78f72b4a0e7..9a1382e98ab 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -11,8 +11,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { class DBWALTest : public DBTestBase { diff --git a/db/db_write_test.cc b/db/db_write_test.cc index e6bab875114..322381b3867 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -12,9 +12,9 @@ #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 0b16c13f573..e3f06fe6b65 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -9,7 +9,7 @@ #include "db/dbformat.h" #include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 81ff8d0b99f..9c67102c5f0 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -21,9 +21,9 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/error_handler_test.cc b/db/error_handler_test.cc index d33e19df5d5..c18706fc28e 100644 --- a/db/error_handler_test.cc +++ b/db/error_handler_test.cc @@ -12,9 +12,9 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 256db0728bf..91a422bed9e 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,7 +9,7 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7bfc64f77cb..26cd1127b94 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -24,7 +24,7 @@ #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 0a0994f0ea9..ebd6cb2b160 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -11,8 +11,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/fault_injection_test_env.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 1bfaa299456..330df7bfe48 100644 --- a/db/fault_injection_test.cc +++ 
b/db/fault_injection_test.cc @@ -21,12 +21,12 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 935a01ef8dd..754cb3c4651 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -13,8 +13,8 @@ #include "db/version_edit.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index 869469f3f0c..dabe673d849 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -12,7 +12,7 @@ #include "db/dbformat.h" #include "port/port.h" #include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index 46915ca13a8..4930ecac7e9 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -51,7 +51,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 199ed29cacc..d97ad9f0c2d 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -15,8 +15,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 94e448ee97d..f95debec62c 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -22,7 +22,7 @@ #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 113ded94b69..9d6851dab16 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -35,7 +35,7 @@ int main() { return 0; } #include "rocksdb/status.h" #include "rocksdb/table.h" #include "util/gflags_compat.h" -#include "util/testharness.h" +#include "test_util/testharness.h" const int MAX_SHARDS = 100000; diff --git a/db/listener_test.cc b/db/listener_test.cc index 6b716a1d4b1..663116b7b8d 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -29,9 +29,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE diff --git a/db/log_test.cc b/db/log_test.cc index fd237b030e7..5b159acf21f 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -14,8 +14,8 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include 
"test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 02732a55583..35e5019ca7e 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -12,7 +12,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "port/port.h" using namespace rocksdb; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index bdcbd218663..b50b58a1af7 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -24,7 +24,7 @@ #include "table/merging_iterator.h" #include "util/coding.h" #include "util/log_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index a14c13b893b..59da8af1664 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -14,8 +14,8 @@ #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index b61092ee575..dc3624af53e 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -10,8 +10,8 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" #include "util/coding.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index 3bd4b9a6004..d3dadaa5d30 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -18,7 +18,7 @@ #include "db/db_impl.h" #include "db/write_batch_internal.h" #include "utilities/merge_operators.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 6bf2acf8519..c6e7d6af07a 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -21,9 +21,9 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using std::cerr; using std::cout; diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 0a9a34ff0b5..c7eba52c290 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -10,7 +10,7 @@ #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { class OptionsFileTest : public testing::Test { diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index b7efec182a1..42d592862c7 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -19,7 +19,7 @@ #include "rocksdb/slice_transform.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index ef770c2e50b..7648ed85ff7 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ 
-33,8 +33,8 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index be420ded183..e8290e76bca 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -31,7 +31,7 @@ int main() { #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 34b2f7e5db1..54a86169b20 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -26,7 +26,7 @@ int main() { #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 28c8129ecb0..7ce666326a8 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -12,7 +12,7 @@ #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index ddd3f774176..11f3574967d 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -7,7 +7,7 @@ #include "db/db_test_util.h" #include "rocksdb/comparator.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 01724dfc5cb..4efd3fdf759 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -25,7 +25,7 @@ #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index ea561e982ff..6171b2938c2 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -20,8 +20,8 @@ #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 514952bb5b1..5c3bd686b1c 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -8,8 +8,8 @@ #include "db/version_set.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 01ec44515a7..018517a1381 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -14,7 +14,7 @@ #include "util/coding.h" #include "util/event_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 64d1fd77bc1..5f1ae98ba4f 100644 --- a/db/version_edit_test.cc +++ 
b/db/version_edit_test.cc @@ -9,8 +9,8 @@ #include "db/version_edit.h" #include "util/coding.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index c10eb9f7ac3..b9616f3730b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -52,7 +52,7 @@ #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 43924a3addd..41c27fdab65 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -12,8 +12,8 @@ #include "table/mock_table.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index cce714750e7..20b5780c877 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -34,7 +34,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 379f12f52aa..b1478e26e54 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -21,8 +21,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 322bd8945b0..88c52522917 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -19,7 +19,7 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 7f2b20d892f..dbb4759fa03 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -17,8 +17,8 @@ #include "rocksdb/write_batch.h" #include "port/port.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" using std::string; diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 55feb00a339..919c2c11808 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -8,7 +8,7 @@ #include "db/write_controller.h" #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_thread.cc b/db/write_thread.cc index 835992c8fce..872d32ca81b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -10,7 +10,7 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/random.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 93764d945f9..f306edbd6ba 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -12,7 +12,7 @@ #include "env/mock_env.h" #include "rocksdb/env.h" #include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include 
"test_util/testharness.h" namespace rocksdb { diff --git a/env/env_posix.cc b/env/env_posix.cc index 387c0279397..3f75dd6893c 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -54,7 +54,7 @@ #include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" diff --git a/env/env_test.cc b/env/env_test.cc index 47800928499..852a99c1adc 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -44,9 +44,9 @@ #include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); diff --git a/env/io_posix.cc b/env/io_posix.cc index 0ced06ff262..27198b1f975 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -33,7 +33,7 @@ #include "rocksdb/slice.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index 97c49b5f516..b21b953b568 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -10,7 +10,7 @@ #include #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/env/posix_logger.h b/env/posix_logger.h index 401df6a3ffb..8406a6d8acc 100644 --- a/env/posix_logger.h +++ b/env/posix_logger.h @@ -27,7 +27,7 @@ #include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 41ec84376b6..44e3110d5e7 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -15,7 +15,7 @@ #include "rocksdb/env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index c8544004cd5..122a5d6177e 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/file/filename.cc b/file/filename.cc index 0a48dc78c36..ed19b4109ff 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -21,7 +21,7 @@ #include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 86bcb2d19ca..9b7278c7d5b 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -17,7 +17,7 @@ #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 
266fb4abf74..9d5de9a2f86 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -23,7 +23,7 @@ #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" /* * Class: org_rocksdb_WriteBatchTest diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index b416ef7c557..a2f62d5304a 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -14,7 +14,7 @@ #include "util/concurrent_arena.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 51ff11a015c..ae199096563 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -39,7 +39,7 @@ int main() { #include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/testutil.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 50c3588bb86..054e3c9df07 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -13,7 +13,7 @@ #include "util/arena.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 3c89c8095e1..06514eabde4 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index df58822fc21..ed9a7bd32ff 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -7,7 +7,7 @@ #include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index 7b61bcf4fb8..796bb26dd4b 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -6,7 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index 74d3e43291d..28d305d021a 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/iostats_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index a77022bfb3d..162afb264b2 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -5,8 +5,8 @@ // #include "port/stack_trace.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "rocksdb/statistics.h" diff --git a/options/options_parser.cc b/options/options_parser.cc index f09e53e4a49..9ae3dfb2785 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -19,7 +19,7 @@ #include "util/cast_util.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "port/port.h" diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 79a4fa81475..2e21a2688f8 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -15,7 +15,7 @@ #include "options/options_helper.h" #include "rocksdb/convenience.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..704b2db802b 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -30,8 +30,8 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators/bytesxor.h" #ifndef GFLAGS diff --git a/port/win/env_default.cc b/port/win/env_default.cc index d24c21918aa..db64878bc02 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -12,7 +12,7 @@ #include #include "port/win/env_win.h" #include "util/compression_context_cache.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 64ded8465d0..15d1e711412 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -12,7 +12,7 @@ #include "monitoring/iostats_context_imp.h" #include "util/aligned_buffer.h" #include "util/coding.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace port { diff --git a/src.mk b/src.mk index 2541b9fd12b..100b3355e74 100644 --- a/src.mk +++ b/src.mk @@ -131,6 +131,9 @@ LIB_SOURCES = \ table/sst_file_writer.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + test_util/sync_point.cc \ + test_util/sync_point_impl.cc \ + test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ util/arena.cc \ util/auto_roll_logger.cc \ @@ -156,12 +159,9 @@ LIB_SOURCES = \ util/slice.cc \ util/status.cc \ util/string_util.cc \ - util/sync_point.cc \ - util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ util/trace_replay.cc \ - util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ @@ -242,15 +242,15 @@ ANALYZER_LIB_SOURCES = \ MOCK_LIB_SOURCES = \ table/mock_table.cc \ - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ TEST_LIB_SOURCES = \ db/db_test_util.cc \ - util/testharness.cc \ - util/testutil.cc \ + test_util/testharness.cc \ + 
test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ MAIN_SOURCES = \ diff --git a/table/block.h b/table/block.h index df4d4eb82fc..869d2f1f286 100644 --- a/table/block.h +++ b/table/block.h @@ -31,7 +31,7 @@ #include "table/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 6b352b2f6b0..2cb3abc27a6 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -13,8 +13,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 34e40979247..a45fc0a5b47 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -51,7 +51,7 @@ #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/xxhash.h" namespace rocksdb { diff --git a/table/block_test.cc b/table/block_test.cc index 3e0ff3eab59..d359b4e59ca 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -23,8 +23,8 @@ #include "table/block_builder.h" #include "table/format.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc index f18c33b8399..8478adf523d 100644 --- a/table/cleanable_test.cc +++ b/table/cleanable_test.cc @@ -9,8 +9,8 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index c1e350327f3..eeba9480592 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -13,8 +13,8 @@ #include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 74fb52e6c78..6d596f6e115 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -31,8 +31,8 @@ int main() { #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/data_block_hash_index_test.cc b/table/data_block_hash_index_test.cc index 11226648ef2..0511b257aa3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/data_block_hash_index_test.cc @@ -15,8 +15,8 @@ #include "table/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" -#include "util/testharness.h" -#include 
"util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 3abae979a4c..0ef5c5a970c 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -10,8 +10,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merger_test.cc b/table/merger_test.cc index 1b04d065727..8efa2834db6 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -7,8 +7,8 @@ #include #include "table/merging_iterator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 244b5e82c3d..85a2fcc0324 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -23,7 +23,7 @@ #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 3f48095c55b..98e05a4d032 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -18,7 +18,7 @@ #include "table/table_properties_internal.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/mock_table.h b/table/mock_table.h index 5bca14644d8..f99941863a9 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -20,8 +20,8 @@ #include "table/table_builder.h" #include "table/table_reader.h" #include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace mock { diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 8afa530d71a..4bdc2fd36f1 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -13,8 +13,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 51bc975af00..529634ccd75 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -11,8 +11,8 @@ #include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index b9a7273e07d..71b395fd6be 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -11,7 +11,7 @@ #include "table/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" #include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 
a9b75715b5f..6b05d385e06 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -24,8 +24,8 @@ int main() { #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/table_test.cc b/table/table_test.cc index 7292ad7c32d..dccc4919409 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -49,9 +49,9 @@ #include "util/compression.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc similarity index 99% rename from util/fault_injection_test_env.cc rename to test_util/fault_injection_test_env.cc index 9cad23871b6..a591ff4b57b 100644 --- a/util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -11,7 +11,7 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include #include diff --git a/util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h similarity index 100% rename from util/fault_injection_test_env.h rename to test_util/fault_injection_test_env.h diff --git a/util/mock_time_env.h b/test_util/mock_time_env.h similarity index 100% rename from util/mock_time_env.h rename to test_util/mock_time_env.h diff --git a/util/sync_point.cc b/test_util/sync_point.cc similarity index 95% rename from util/sync_point.cc rename to test_util/sync_point.cc index 4599c256d9f..a09be9e8fa1 100644 --- a/util/sync_point.cc +++ b/test_util/sync_point.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" -#include "util/sync_point_impl.h" +#include "test_util/sync_point.h" +#include "test_util/sync_point_impl.h" int rocksdb_kill_odds = 0; std::vector<std::string> rocksdb_kill_prefix_blacklist; diff --git a/util/sync_point.h b/test_util/sync_point.h similarity index 100% rename from util/sync_point.h rename to test_util/sync_point.h diff --git a/util/sync_point_impl.cc b/test_util/sync_point_impl.cc similarity index 98% rename from util/sync_point_impl.cc rename to test_util/sync_point_impl.cc index 248c381a328..db44f472a05 100644 --- a/util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point_impl.h" +#include "test_util/sync_point_impl.h" #ifndef NDEBUG namespace rocksdb { diff --git a/util/sync_point_impl.h b/test_util/sync_point_impl.h similarity index 98% rename from util/sync_point_impl.h rename to test_util/sync_point_impl.h index 3c7e7049183..d96d7325786 100644 --- a/util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory).
-#include "util/sync_point.h" +#include "test_util/sync_point.h" #include #include diff --git a/util/testharness.cc b/test_util/testharness.cc similarity index 97% rename from util/testharness.cc rename to test_util/testharness.cc index 8f5eb2a4d6e..62cc535a198 100644 --- a/util/testharness.cc +++ b/test_util/testharness.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testharness.h" +#include "test_util/testharness.h" #include #include diff --git a/util/testharness.h b/test_util/testharness.h similarity index 100% rename from util/testharness.h rename to test_util/testharness.h diff --git a/util/testutil.cc b/test_util/testutil.cc similarity index 99% rename from util/testutil.cc rename to test_util/testutil.cc index b6493258f60..18e1a45bb36 100644 --- a/util/testutil.cc +++ b/test_util/testutil.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testutil.h" +#include "test_util/testutil.h" #include #include diff --git a/util/testutil.h b/test_util/testutil.h similarity index 100% rename from util/testutil.h rename to test_util/testutil.h diff --git a/util/transaction_test_util.cc b/test_util/transaction_test_util.cc similarity index 99% rename from util/transaction_test_util.cc rename to test_util/transaction_test_util.cc index bd2d6afdca0..14d39065182 100644 --- a/util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -8,7 +8,7 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/transaction_test_util.h" +#include "test_util/transaction_test_util.h" #include #include diff --git a/util/transaction_test_util.h b/test_util/transaction_test_util.h similarity index 100% rename from util/transaction_test_util.h rename to test_util/transaction_test_util.h diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 2ceca4fd950..12caa2809ad 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -68,8 +68,8 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "util/xxhash.h" #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 1b19de5f17e..52a1f9b91eb 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -11,8 +11,8 @@ #include "options/options_parser.h" #include "rocksdb/utilities/options_util.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifdef GFLAGS #include "util/gflags_compat.h" diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index c640b5945b0..41ae4c2761e 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -19,7 +19,7 @@ int main() { #include "rocksdb/db.h" #include "rocksdb/types.h" #include "util/gflags_compat.h" -#include "util/testutil.h" +#include "test_util/testutil.h" // Run a thread to perform Put's. // Another thread uses GetUpdatesSince API to keep getting the updates. 
diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 579178efffc..72461b13ab4 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -72,9 +72,9 @@ int main() { #include "util/string_util.h" // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif // !(defined NDEBUG) || !defined(OS_WIN) -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 3b709953373..24622b7ccf3 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" -#include "util/testharness.h" +#include "test_util/testharness.h" using std::string; using std::vector; diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index 1718b3344e9..a76416b6c1d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -12,8 +12,8 @@ #include "rocksdb/utilities/ldb_cmd.h" #include "tools/ldb_cmd_impl.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index 6bf3e3b97a1..a2c226b926c 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -16,8 +16,8 @@ #include "table/block_based_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index b2cc777d5a4..2f31c5d8249 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -28,8 +28,8 @@ int main() { #include "rocksdb/status.h" #include "rocksdb/trace_reader_writer.h" #include "tools/trace_analyzer_tool.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/trace_replay.h" namespace rocksdb { diff --git a/util/arena.cc b/util/arena.cc index d7799eb266a..67e8a4db782 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -22,7 +22,7 @@ #include "port/port.h" #include "rocksdb/env.h" #include "util/logging.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/arena_test.cc b/util/arena_test.cc index 9dfc28ab2ea..052f2a6d5db 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -9,7 +9,7 @@ #include "util/arena.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 24f4714b4fd..5a2049b6405 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -14,7 +14,7 @@ #include "port/port.h" #include "port/util_logger.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index ab9e0595808..3adbdbb1363 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -20,8 +20,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "util/logging.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include 
"test_util/testharness.h" namespace rocksdb { namespace { diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 13299669cd4..edb7af9eaf2 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -11,8 +11,8 @@ #include "rocksdb/env.h" #include "util/autovector.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using std::cout; using std::endl; diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 4b25e9b6c6f..87cd9da5569 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -22,8 +22,8 @@ int main() { #include "util/arena.h" #include "util/gflags_compat.h" #include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/coding_test.cc b/util/coding_test.cc index f7b1671d1ec..7f73e00e155 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -9,7 +9,7 @@ #include "util/coding.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index d5983586bc6..90f0c815cc2 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "util/crc32c.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/coding.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 4244bff1a4e..a8a7000f648 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -29,8 +29,8 @@ int main() { #include "util/gflags_compat.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/event_logger_test.cc b/util/event_logger_test.cc index 4bcf30ff5eb..16c6c59f70e 100644 --- a/util/event_logger_test.cc +++ b/util/event_logger_test.cc @@ -6,7 +6,7 @@ #include #include "util/event_logger.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 9a818cb0f07..3003a1ebac0 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -17,7 +17,7 @@ #include "port/port.h" #include "util/random.h" #include "util/rate_limiter.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 1ef23e8c936..317c1d6c78c 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -16,7 +16,7 @@ #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" #include "util/aligned_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 6a7ea6d7da4..18bb65a72bb 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -7,8 +7,8 @@ #include #include #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git 
a/util/filelock_test.cc b/util/filelock_test.cc index f8721b5909a..bd0fc7c4221 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -9,7 +9,7 @@ #include #include #include "util/coding.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/hash_test.cc b/util/hash_test.cc index 959e8cd0f68..6618c5a4bc1 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -10,7 +10,7 @@ #include #include "util/hash.h" -#include "util/testharness.h" +#include "test_util/testharness.h" // The hash algorithm is part of the file format, for example for the Bloom // filters. Test that the hash values are stable for a set of random strings of diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index 5c9b3e84bf4..dd5322151e3 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -15,8 +15,8 @@ int main() { #include "rocksdb/env.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 9d23c38f7ac..93665837fc4 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -12,7 +12,7 @@ #include "port/port.h" #include "rocksdb/env.h" #include "util/aligned_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index d3f3be3ba95..3316a75b571 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -20,8 +20,8 @@ #include "db/db_test_util.h" #include "rocksdb/env.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 2d4729da02c..4226f35396c 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -10,7 +10,7 @@ #include "port/port.h" #include "rocksdb/env.h" -#include "util/mock_time_env.h" +#include "test_util/mock_time_env.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/repeatable_thread_test.cc b/util/repeatable_thread_test.cc index ee853c1056f..29af340d7cb 100644 --- a/util/repeatable_thread_test.cc +++ b/util/repeatable_thread_test.cc @@ -8,8 +8,8 @@ #include "db/db_test_util.h" #include "util/repeatable_thread.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" class RepeatableThreadTest : public testing::Test { public: diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc index f91675ccec8..96d90a9cd9b 100644 --- a/util/slice_transform_test.cc +++ b/util/slice_transform_test.cc @@ -14,7 +14,7 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index a4a343a9cf4..37f59bab8ca 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -8,7 +8,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/db.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifdef ROCKSDB_USING_THREAD_STATUS diff --git a/util/thread_local_test.cc 
b/util/thread_local_test.cc index 789be83d8fd..787638138c0 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -10,9 +10,9 @@ #include "rocksdb/env.h" #include "port/port.h" #include "util/autovector.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/util/timer_queue.h b/util/timer_queue.h index bd8a4f85048..a5f74ae5679 100644 --- a/util/timer_queue.h +++ b/util/timer_queue.h @@ -32,7 +32,7 @@ #include #include "port/port.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" // Allows execution of handlers at a specified time in the future // Guarantees: diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 149eb911f7f..816c9718b2d 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -20,7 +20,7 @@ #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/checkpoint/checkpoint_impl.h" #ifndef __STDC_FORMAT_MACROS diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index e4abd96e95f..c7377064f82 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -27,9 +27,9 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 9f3839370eb..54eb3f2dbb5 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -33,7 +33,7 @@ #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_iterator.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index e24ba1d983c..19dce3f87d7 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -19,11 +19,11 @@ #include "port/port.h" #include "rocksdb/utilities/debug.h" #include "util/cast_util.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "utilities/blob_db/blob_db.h" #include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_index.h" diff --git a/utilities/cassandra/cassandra_format_test.cc b/utilities/cassandra/cassandra_format_test.cc index 8f9baa72357..7af21247eb1 100644 --- a/utilities/cassandra/cassandra_format_test.cc +++ b/utilities/cassandra/cassandra_format_test.cc @@ -5,7 +5,7 @@ #include #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/serialize.h" #include "utilities/cassandra/test_utils.h" diff --git 
a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index dacc6f03ce3..347846d075c 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -8,7 +8,7 @@ #include "db/db_impl.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/cassandra/cassandra_compaction_filter.h" diff --git a/utilities/cassandra/cassandra_row_merge_test.cc b/utilities/cassandra/cassandra_row_merge_test.cc index 8d6dc10ded0..88dee118b5b 100644 --- a/utilities/cassandra/cassandra_row_merge_test.cc +++ b/utilities/cassandra/cassandra_row_merge_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/test_utils.h" diff --git a/utilities/cassandra/cassandra_serialize_test.cc b/utilities/cassandra/cassandra_serialize_test.cc index 68d2c163d96..bfce2a36e30 100644 --- a/utilities/cassandra/cassandra_serialize_test.cc +++ b/utilities/cassandra/cassandra_serialize_test.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/serialize.h" using namespace rocksdb::cassandra; diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h index 09a4923565f..562c1aff3ff 100644 --- a/utilities/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -60,7 +60,7 @@ #include #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { namespace cassandra { diff --git a/utilities/cassandra/test_utils.h b/utilities/cassandra/test_utils.h index 80374b0cbab..f58bd730015 100644 --- a/utilities/cassandra/test_utils.h +++ b/utilities/cassandra/test_utils.h @@ -5,7 +5,7 @@ #pragma once #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/serialize.h" diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 920f9bf535b..7468c8eedee 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -28,7 +28,7 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/checkpoint.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 9318a733dcf..da2972affd7 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -23,9 +23,9 @@ #include "rocksdb/env.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/transaction_db.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { class CheckpointTest : public testing::Test { diff --git a/utilities/env_librados_test.cc b/utilities/env_librados_test.cc index 1a3746860b6..e5f91894599 100644 --- 
a/utilities/env_librados_test.cc +++ b/utilities/env_librados_test.cc @@ -9,7 +9,7 @@ #include "rocksdb/utilities/env_librados.h" #include #include "env/mock_env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "rocksdb/db.h" #include "rocksdb/slice.h" diff --git a/utilities/env_mirror_test.cc b/utilities/env_mirror_test.cc index 3c0ed228522..6b20f1f1334 100644 --- a/utilities/env_mirror_test.cc +++ b/utilities/env_mirror_test.cc @@ -8,7 +8,7 @@ #include "rocksdb/utilities/env_mirror.h" #include "env/mock_env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/env_timed_test.cc b/utilities/env_timed_test.cc index 8bdef6396e0..989c79a391d 100644 --- a/utilities/env_timed_test.cc +++ b/utilities/env_timed_test.cc @@ -7,7 +7,7 @@ #include "rocksdb/env.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 8d976ef9214..c3ff640816e 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -12,8 +12,8 @@ #include "rocksdb/utilities/stackable_db.h" #include "table/block_based_table_factory.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 54c89a03abf..160bd347bd2 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -15,7 +15,7 @@ #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/merge_operators/string_append/stringappend2.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/random.h" using namespace rocksdb; diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index 4444d8712f9..cc7c38d8a65 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index ed7bfdfd6f7..342db490280 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index f7f72df6dfc..775ef29cf8d 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -13,7 +13,7 @@ #include "port/port.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/persistent_cache/block_cache_tier_file.h" namespace rocksdb { diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc 
index d6ff3e68e42..51ad211e929 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -11,7 +11,7 @@ #include "db/db_test_util.h" #include "util/arena.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index ad99ea864bd..33cda4ea72d 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -23,7 +23,7 @@ #include "table/block_builder.h" #include "port/port.h" #include "util/arena.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/persistent_cache/volatile_tier_impl.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index fbb0d44fdc7..e3105a2139c 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -15,8 +15,8 @@ #include "util/crc32c.h" #include "util/logging.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/transaction_test_util.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" #include "port/port.h" using std::string; diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index d0e4f20467b..fd9da17aac4 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -21,7 +21,7 @@ #include "rocksdb/utilities/transaction_db.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction_db.h" #include "utilities/transactions/transaction_util.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index c1b37c148f5..95c88594ca9 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -22,7 +22,7 @@ #include "rocksdb/utilities/transaction_db.h" #include "util/cast_util.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" #include "utilities/transactions/write_prepared_txn_db.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 48d496bfd7f..173e012d88a 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -25,7 +25,7 @@ #include "rocksdb/utilities/transaction_db_mutex.h" #include "util/cast_util.h" #include "util/hash.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 3c8036614f0..d183401f42f 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -23,13 +23,13 @@ #include "rocksdb/utilities/transaction.h" #include 
"rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 2e3b9952709..8dfa6b053c5 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -21,13 +21,13 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7b5a585df91..5287cca2038 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -27,14 +27,14 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 5250f3f2de5..0508a596e43 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -24,7 +24,7 @@ #include "util/cast_util.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 88e90af269c..c7d8f52aa52 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -10,7 +10,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/utilities/db_ttl.h" #include "util/string_util.h" -#include 
"util/testharness.h" +#include "test_util/testharness.h" #ifndef OS_WIN #include #endif diff --git a/utilities/util_merge_operators_test.cc b/utilities/util_merge_operators_test.cc index d8b3cfba69c..d591ac8f12c 100644 --- a/utilities/util_merge_operators_test.cc +++ b/utilities/util_merge_operators_test.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index be715fe32ca..f8875d9ac1f 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -16,7 +16,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" From bd44ec2006fbb44d632ee7be7cf8f553d90b09d9 Mon Sep 17 00:00:00 2001 From: anand76 Date: Thu, 30 May 2019 11:38:02 -0700 Subject: [PATCH 082/572] Fix reopen voting logic in db_stress when using MultiGet (#5374) Summary: When the --reopen option is non-zero, the DB is reopened after every ops_per_thread/(reopen+1) ops, with the check being done after every op. With MultiGet, we might do multiple ops in one iteration, which broke the logic that checked when to synchronize among the threads and reopen the DB. This PR fixes that logic. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5374 Differential Revision: D15559780 Pulled By: anand1976 fbshipit-source-id: ee6563a68045df7f367eca3cbc2500d3e26359ef --- tools/db_stress.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 72461b13ab4..b9ab1a2df11 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1967,13 +1967,18 @@ class StressTest { const int writeBound = prefixBound + (int)FLAGS_writepercent; const int delBound = writeBound + (int)FLAGS_delpercent; const int delRangeBound = delBound + (int)FLAGS_delrangepercent; + const uint64_t ops_per_open = FLAGS_ops_per_thread / (FLAGS_reopen + 1); + int multiget_batch_size = 0; thread->stats.Start(); for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { if (thread->shared->HasVerificationFailedYet()) { break; } - if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + // Check if the multiget batch crossed the ops_per_open boundary. If it + // did, then we should vote to reopen + if (i != 0 && (i % ops_per_open == 0 || + i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { { thread->stats.FinishedSingleOp(); MutexLock l(thread->shared->GetMutex()); @@ -2168,7 +2173,7 @@ class StressTest { snap_state); } while (!thread->snapshot_queue.empty() && - i == thread->snapshot_queue.front().first) { + i >= thread->snapshot_queue.front().first) { auto snap_state = thread->snapshot_queue.front().second; assert(snap_state.snapshot); // Note: this is unsafe as the cf might be dropped concurrently. 
But it @@ -2185,13 +2190,24 @@ class StressTest { } int prob_op = thread->rand.Uniform(100); + // Reset this in case we pick something other than a read op. We don't + // want to use a stale value when deciding at the beginning of the loop + // whether to vote to reopen + multiget_batch_size = 0; if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { // OPERATION read if (FLAGS_use_multiget) { - int num_keys = thread->rand.Uniform(64); - rand_keys = GenerateNKeys(thread, num_keys, i); + // Leave room for one more iteration of the loop with a single key + // batch. This is to ensure that each thread does exactly the same + // number of ops + multiget_batch_size = static_cast<int>( + std::min(static_cast<uint64_t>(thread->rand.Uniform(64)), + FLAGS_ops_per_thread - i - 1)); + // If it's the last iteration, ensure that multiget_batch_size is 1 + multiget_batch_size = std::max(multiget_batch_size, 1); + rand_keys = GenerateNKeys(thread, multiget_batch_size, i); TestMultiGet(thread, read_opts, rand_column_families, rand_keys); - i += num_keys - 1; + i += multiget_batch_size - 1; } else { TestGet(thread, read_opts, rand_column_families, rand_keys); } From 1e355842519debea764a8e04c5c08918dcc01d91 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 30 May 2019 11:49:36 -0700 Subject: [PATCH 083/572] Move the index readers out of the block cache (#5298) Summary: Currently, when the block cache is used for index blocks as well, it is not really the index block that is stored in the cache but an IndexReader object. Since this object is not pure data (it has, for instance, pointers that might dangle), it's not really sharable. To avoid the issues around this, the current code uses a dummy unique cache key for each TableReader to store the IndexReader, and erases the IndexReader entry when the TableReader is closed. Instead of doing this, the new code moves the IndexReader out of the cache altogether. In particular, instead of the TableReader owning, or caching/pinning the IndexReader based on the customer's settings, the TableReader unconditionally owns the IndexReader, which in turn owns/caches/pins the index block (which is itself sharable and thus can be safely put in the cache without any hacks). Note: the change has two side effects: 1) Partitions of partitioned indexes no longer affect the read amplification statistics. 2) Eviction statistics for index blocks are temporarily broken. We plan to fix this in a separate phase. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298 Differential Revision: D15303203 Pulled By: ltamasi fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47 --- HISTORY.md | 2 + db/db_block_cache_test.cc | 12 +- table/block_based_table_reader.cc | 1085 ++++++++++++++--------------- table/block_based_table_reader.h | 114 ++- table/table_test.cc | 67 +- 5 files changed, 590 insertions(+), 690 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 40d11096df0..55366b006fc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,8 @@ ## Unreleased ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs.
Assign to 0 to disable the feature. diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index f6e1aad323c..8eb73a23dd7 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -365,7 +365,10 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); + // The index eviction statistics were broken by the refactoring that moved + // the index readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. @@ -377,8 +380,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), - index_bytes_insert); + // The index eviction statistics were broken by the refactoring that moved + // the index readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), + // index_bytes_insert); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), filter_bytes_insert); } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a45fc0a5b47..82f96492662 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -120,16 +120,8 @@ void DeleteCachedEntry(const Slice& /*key*/, void* value) { } void DeleteCachedFilterEntry(const Slice& key, void* value); -void DeleteCachedIndexEntry(const Slice& key, void* value); void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); -// Release the cached entry and decrement its ref count. -void ReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - // Release the cached entry and decrement its ref count. void ForceReleaseCachedEntry(void* arg, void* h) { Cache* cache = reinterpret_cast(arg); @@ -137,17 +129,6 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } -Slice GetCacheKeyFromOffset(const char* cache_key_prefix, - size_t cache_key_prefix_size, uint64_t offset, - char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); - return Slice(cache_key, static_cast(end - cache_key)); -} - Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, int level, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, @@ -217,70 +198,193 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } // namespace +// Encapsulates common functionality for the various index reader +// implementations. 
Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { +public: + IndexReaderCommon(BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t) + , index_block_(std::move(index_block)) + { + assert(table_ != nullptr); + } + +protected: + static Status ReadIndexBlock(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, + GetContext* get_context, CachableEntry* index_block); + + BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + const TableProperties* const properties = + table_->get_rep()->table_properties.get(); + + return properties == nullptr || !properties->index_key_is_user_key; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + const TableProperties* const properties = + table_->get_rep()->table_properties.get(); + + return properties == nullptr || !properties->index_value_is_delta_encoded; + } + + Status GetOrReadIndexBlock(const ReadOptions& read_options, + GetContext* get_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() ? + index_block_.GetValue()->ApproximateMemoryUsage() : 0; + } + +private: + BlockBasedTable* table_; + CachableEntry index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + CachableEntry* index_block) { + + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + constexpr bool is_index = true; + const Status s = BlockBasedTable::RetrieveBlock(prefetch_buffer, + rep, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + const ReadOptions& read_options, GetContext* get_context, + CachableEntry* index_block) const { + + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + *index_block = CachableEntry(index_block_.GetValue(), + nullptr /* cache */, nullptr /* cache_handle */, false /* own_value */); + return Status::OK(); + } + + return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, + read_options, get_context, index_block); +} + // Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader { +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { public: // Read the partition index from the file and create an instance for // `PartitionIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. 
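// (In the new signature below, the `use_cache`, `prefetch`, and `pin` flags // interact as follows: the index block is read eagerly when `prefetch` is set // or the block cache is bypassed; with `use_cache && !pin`, the eagerly read // block is released again, so later lookups go through GetOrReadIndexBlock // and hence the block cache.)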
- static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (s.ok()) { - *index_reader = new PartitionIndexReader( - table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq, index_value_is_full); + if (use_cache && !pin) { + index_block.Reset(); + } } - return s; + *index_reader = new PartitionIndexReader(table, std::move(index_block)); + + return Status::OK(); } // return a two-level iterator: first level is on the partition index InternalIteratorBase* NewIterator( - IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, - bool fill_cache = true) override { + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context) override { + + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + InternalIteratorBase* it = nullptr; + Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - return NewTwoLevelIterator( + it = NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_, - index_value_is_full_), - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_)); + table(), &partition_map_, index_key_includes_seq(), + index_value_is_full()), + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq(), + index_value_is_full())); } else { - auto ro = ReadOptions(); - ro.fill_cache = fill_cache; - bool kIsIndex = true; - // We don't return pinned datat from index blocks, so no need + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + constexpr bool is_index = true; + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return new BlockBasedTableIterator( - table_, ro, *icomparator_, - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_), - false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_, index_value_is_full_); + it = new BlockBasedTableIterator( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq(), + index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, is_index, + index_key_includes_seq(), index_value_is_full()); } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currently it assumes // the first level iter is always on heap and will attempt to delete it @@ -289,15 +393,26 @@ class PartitionIndexReader : public IndexReader { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; + auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + + CachableEntry<Block> index_block; + Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, + &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_key_includes_seq(), index_value_is_full()); // Index partitions are assumed to be consecutive. Prefetch them all.
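// (Concretely, the code below locates the handles of the first and last // partitions and issues a single prefetch for the contiguous byte range // [first.offset(), last.offset() + last.size() + kBlockTrailerSize).)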
// Read the first block offset biter.SeekToFirst(); @@ -318,10 +433,10 @@ class PartitionIndexReader : public IndexReader { uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; - auto& file = table_->rep_->file; + auto& file = rep->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast<size_t>(prefetch_len)); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); @@ -332,7 +447,7 @@ class PartitionIndexReader : public IndexReader { const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks - s = table_->MaybeReadBlockAndLoadToCache( + s = BlockBasedTable::MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), rep, ro, handle, UncompressionDict::GetEmptyDict(), &block, is_index, nullptr /* get_context */); @@ -348,12 +463,8 @@ } } - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -364,78 +475,79 @@ } private: - PartitionIndexReader(BlockBasedTable* table, - const InternalKeyComparator* icomparator, - std::unique_ptr<Block>&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - table_(table), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - BlockBasedTable* table_; - std::unique_ptr<Block> index_block_; + PartitionIndexReader(BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} + std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. // This class can be viewed as a thin wrapper for `Block` class which already // supports binary search. -class BinarySearchIndexReader : public IndexReader { +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { public: // Read index from the file and create an instance for // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified.
- static Status Create(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (s.ok()) { - *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); + if (use_cache && !pin) { + index_block.Reset(); + } } - return s; + *index_reader = new BinarySearchIndexReader(table, std::move(index_block)); + + return Status::OK(); } InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context) override { + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - } + auto it = index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_key_includes_seq(), index_value_is_full()); - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -445,60 +557,51 @@ class BinarySearchIndexReader : public IndexReader { } private: - BinarySearchIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - std::unique_ptr index_block_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; + BinarySearchIndexReader(BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} }; // Index that leverages an internal hash table to quicken the lookup for a given // key. -class HashIndexReader : public IndexReader { +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { public: - static Status Create( - const SliceTransform* hash_key_extractor, const Footer& footer, - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, const BlockHandle& index_handle, - InternalIterator* meta_index_iter, IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + auto rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (!s.ok()) { - return s; + if (use_cache && !pin) { + index_block.Reset(); + } } // Note, failure to create prefix hash index does not need to be a // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
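// In other words, any failure below merely leaves prefix_index_ unset; the // reader then behaves like a plain binary search index over the same block.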
- auto new_index_reader = new HashIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); + auto new_index_reader = new HashIndexReader(table, std::move(index_block)); *index_reader = new_index_reader; // Get prefixes block BlockHandle prefixes_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); if (!s.ok()) { // TODO: log error return Status::OK(); @@ -513,6 +616,13 @@ class HashIndexReader : public IndexReader { return Status::OK(); } + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + // Read contents for the blocks BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( @@ -537,7 +647,8 @@ class HashIndexReader : public IndexReader { } BlockPrefixIndex* prefix_index = nullptr; - s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, prefixes_meta_contents.data, &prefix_index); // TODO: log error if (s.ok()) { @@ -548,24 +659,39 @@ class HashIndexReader : public IndexReader { } InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool /*dont_care*/ = true) override { + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context) override { + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + const bool total_order_seek = read_options.total_order_seek || + disable_prefix_seek; + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, index_value_is_full_, - false /* block_contents_pinned */, prefix_index_.get()); - } + auto it = index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_key_includes_seq(), + index_value_is_full(), false /* block_contents_pinned */, + prefix_index_.get()); - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); - usage += prefixes_contents_.usable_size(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -578,37 +704,22 @@ class HashIndexReader : public IndexReader { } private: - HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } + HashIndexReader(BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} - ~HashIndexReader() override {} - - std::unique_ptr index_block_; std::unique_ptr prefix_index_; - BlockContents prefixes_contents_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - // Create dummy offset of index reader which is beyond the file size. - rep->dummy_index_reader_offset = - file_size + rep->table_options.block_cache->NewId(); } if (rep->table_options.persistent_cache != nullptr) { GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), @@ -814,7 +925,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // handle prefix correctly. 
rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); - SetupCacheKeyPrefix(rep, file_size); + SetupCacheKeyPrefix(rep); std::unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep)); // page cache options @@ -848,9 +959,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, return s; } s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefix_extractor, - prefetch_all, table_options, level, - prefetch_index_and_filter_in_cache); + new_table.get(), prefetch_all, table_options, + level); if (s.ok()) { // Update tail prefetch stats @@ -1116,9 +1226,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( Status BlockBasedTable::PrefetchIndexAndFilterBlocks( Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, const SliceTransform* prefix_extractor, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level, const bool prefetch_index_and_filter_in_cache) { + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level) { Status s; // Find filter handle and filter type @@ -1157,10 +1266,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( &rep->compression_dict_handle); } - bool need_upper_bound_check = - PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + + const bool use_cache = table_options.cache_index_and_filter_blocks; + // prefetch the first level of index const bool prefetch_index = prefetch_all || @@ -1183,39 +1292,34 @@ const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && rep->filter_type == Rep::FilterType::kPartitionedFilter); + + IndexReader* index_reader = nullptr; + if (s.ok()) { + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, &index_reader); + if (s.ok()) { + assert(index_reader != nullptr); + rep->index_reader.reset(index_reader); + // The partitions of partitioned index are always stored in cache. They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep->index_reader->CacheDependencies(pin_all); + } + } else { + delete index_reader; + index_reader = nullptr; + } + } + // pre-fetching of blocks is turned on // Will use block cache for meta-blocks access // Always prefetch index and filter for level 0 // TODO(ajkr): also prefetch compression dictionary block + // TODO(ajkr): also pin compression dictionary block when + // `pin_l0_filter_and_index_blocks_in_cache == true`. if (table_options.cache_index_and_filter_blocks) { assert(table_options.block_cache != nullptr); - if (prefetch_index) { - // Hack: Call NewIndexIterator() to implicitly add index to the - // block_cache - CachableEntry<IndexReader> index_entry; - // check prefix_extractor match only if hash based index is used - bool disable_prefix_seek = - rep->index_type == BlockBasedTableOptions::kHashSearch && - need_upper_bound_check; - if (s.ok()) { - std::unique_ptr<InternalIteratorBase<BlockHandle>> iter( - new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, - nullptr, &index_entry)); - s = iter->status(); - } - if (s.ok()) { - // This is the first call to NewIndexIterator() since we're in Open(). - // On success it should give us ownership of the `CachableEntry` by - // populating `index_entry`.
- assert(index_entry.GetValue() != nullptr); - if (prefetch_all) { - index_entry.GetValue()->CacheDependencies(pin_all); - } - if (pin_index) { - rep->index_entry = std::move(index_entry); - } - } - } if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = @@ -1232,24 +1336,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } } else { - // If we don't use block cache for meta-block access, we'll pre-load these - // blocks, which will kept in member variables in Rep and with a same life- - // time as this table object. - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, &index_reader, - meta_iter, level); - } std::unique_ptr compression_dict_block; if (s.ok()) { - rep->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks - if (prefetch_index_and_filter_in_cache || level == 0) { - rep->index_reader->CacheDependencies(pin_all); - } - // Set filter block if (rep->filter_policy) { const bool is_a_filter_partition = true; @@ -1259,14 +1347,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached - if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { + if (filter && prefetch_all) { filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); } } s = ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); - } else { - delete index_reader; } if (s.ok() && !rep->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); @@ -1350,7 +1436,7 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index, GetContext* get_context) { @@ -1379,6 +1465,10 @@ Status BlockBasedTable::GetDataBlockFromCache( : nullptr, statistics, get_context); if (cache_handle != nullptr) { + if (is_index) { + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + } + block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); @@ -1843,119 +1933,15 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, CachableEntry* index_entry, - GetContext* get_context) { - // index reader has already been pre-populated. - if (rep_->index_reader) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - // we have a pinned index block - if (rep_->index_entry.IsCached()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. 
- return rep_->index_entry.GetValue()->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - - PERF_TIMER_GUARD(read_index_block_nanos); - - const bool no_io = read_options.read_tier == kBlockCacheTier; - Cache* block_cache = rep_->table_options.block_cache.get(); - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, - get_context ? &get_context->get_context_stats_.num_cache_index_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_index_hit - : nullptr, - statistics, get_context); - - if (cache_handle == nullptr && no_io) { - if (input_iter != nullptr) { - input_iter->Invalidate(Status::Incomplete("no blocking io")); - return input_iter; - } else { - return NewErrorInternalIterator( - Status::Incomplete("no blocking io")); - } - } + IndexBlockIter* input_iter, GetContext* get_context) { - IndexReader* index_reader = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - index_reader = - reinterpret_cast(block_cache->Value(cache_handle)); - } else { - // Create index reader and put it in the cache. - Status s; - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); - s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); - size_t charge = 0; - if (s.ok()) { - assert(index_reader != nullptr); - charge = index_reader->ApproximateMemoryUsage(); - s = block_cache->Insert( - key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - } - - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - PERF_COUNTER_ADD(index_block_read_count, 1); - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } else { - if (index_reader != nullptr) { - delete index_reader; - } - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - // make sure if something goes wrong, index_reader shall remain intact. - if (input_iter != nullptr) { - input_iter->Invalidate(s); - return input_iter; - } else { - return NewErrorInternalIterator(s); - } - } - } + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); - assert(cache_handle); - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- auto* iter = index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek); - - // the caller would like to take ownership of the index block - // don't call RegisterCleanup() in this case, the caller will take care of it - if (index_entry != nullptr) { - *index_entry = {index_reader, block_cache, cache_handle, - false /* own_value */}; - } else { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); - } - - return iter; + return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context); } // Convert an index iterator value (i.e., an encoded BlockHandle) @@ -1970,118 +1956,85 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); - Cache* block_cache = rep->table_options.block_cache.get(); + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + const bool no_io = (ro.read_tier == kBlockCacheTier); + auto uncompression_dict_storage = + GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + const UncompressionDict& uncompression_dict = + uncompression_dict_storage.GetValue() == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.GetValue(); + CachableEntry block; - TBlockIter* iter; - { - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); - if (s.ok()) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, - uncompression_dict, &block, is_index, - get_context); - } + s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict, + &block, is_index, get_context); - if (input_iter != nullptr) { - iter = input_iter; - } else { - iter = new TBlockIter; - } - // Didn't get any data from block caches. - if (s.ok() && block.GetValue() == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - iter->Invalidate(Status::Incomplete("no blocking io")); - return iter; - } - std::unique_ptr block_value; - { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, - READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &block_value, rep->ioptions, - rep->blocks_maybe_compressed /*do_decompress*/, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, - is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options)); - } + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + constexpr bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
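+ // For example, a block served from the block cache stays valid for as long + // as the iterator holds the corresponding cache handle; the cleanup that + // releases the handle is transferred onto the iterator via + // block.TransferTo(iter) below.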
+ const bool block_contents_pinned = block.IsCached() || + (!block.GetValue()->own_bytes() && rep->immortal_table); + iter = block.GetValue()->NewIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep->cache_key_prefix_size != 0); + assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); if (s.ok()) { - block.SetOwnedValue(block_value.release()); + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); } } - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. } - if (s.ok()) { - assert(block.GetValue() != nullptr); - const bool kTotalOrderSeek = true; - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - bool block_contents_pinned = - (block.IsCached() || - (!block.GetValue()->own_bytes() && rep->immortal_table)); - iter = block.GetValue()->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); - if (!block.IsCached()) { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache::Handle* cache_handle; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - Slice unique_key = - Slice(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - if (s.ok()) { - if (cache_handle != nullptr) { - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } - } - - block.TransferTo(iter); - } else { - assert(block.GetValue() == nullptr); - iter->Invalidate(s); - } + block.TransferTo(iter); return iter; } Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, bool is_index, GetContext* get_context) { assert(block_entry != nullptr); @@ -2116,7 +2069,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry, uncompression_dict, - rep->table_options.read_amp_bytes_per_bit, + !is_index ? + rep->table_options.read_amp_bytes_per_bit : 0, is_index, get_context); // Can't find the block from the cache. If I/O is allowed, read from the @@ -2148,7 +2102,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, block_entry, &raw_block_contents, raw_block_comp_type, rep->table_options.format_version, uncompression_dict, seq_no, - rep->table_options.read_amp_bytes_per_bit, + !is_index ? 
rep->table_options.read_amp_bytes_per_bit : 0, GetMemoryAllocator(rep->table_options), is_index, is_index && rep->table_options .cache_index_and_filter_blocks_with_high_priority @@ -2162,6 +2116,64 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( return s; } +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, GetContext* get_context) { + + assert(rep); + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (!is_index || rep->table_options.cache_index_and_filter_blocks) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + uncompression_dict, block_entry, + is_index, get_context); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert (s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + std::unique_ptr block; + + { + StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile(rep->file.get(), prefetch_buffer, rep->footer, ro, + handle, &block, rep->ioptions, + rep->blocks_maybe_compressed, + rep->blocks_maybe_compressed, uncompression_dict, + rep->persistent_cache_options, + rep->get_global_seqno(is_index), + !is_index ? + rep->table_options.read_amp_bytes_per_bit : 0, + GetMemoryAllocator(rep->table_options)); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, @@ -2188,7 +2200,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, block_cache->GetUsage(block->second.GetCacheHandle())); Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
return block->second.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), @@ -2747,7 +2759,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, get_context); + get_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2868,7 +2880,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } auto iiter = NewIndexIterator( read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, sst_file_range.begin()->get_context); + sst_file_range.begin()->get_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -3085,45 +3097,37 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( return s; } +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = GetCacheKey(rep_->cache_key_prefix, + rep_->cache_key_prefix_size, handle, + cache_key_storage); + + Cache::Handle* const cache_handle = cache->Lookup(cache_key); + if (cache_handle == nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr> iiter( NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); - CachableEntry block; - - BlockHandle handle = iiter->value(); - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(block_cache != nullptr); - - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); - Slice ckey; - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - &compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - UncompressionDict uncompression_dict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed); - s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, - options, &block, uncompression_dict, - 0 /* read_amp_bytes_per_bit */); - } - } else { - s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_, options, &block, - UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); - } - assert(s.ok()); - return block.IsCached(); + return TEST_BlockInCache(iiter->value()); } BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { @@ -3151,14 +3155,11 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { // 4. internal_comparator // 5. 
index_type Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter, int level) { + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, IndexReader** index_reader) { auto index_type_on_file = rep_->index_type; - auto file = rep_->file.get(); - const InternalKeyComparator* icomparator = &rep_->internal_comparator; - const Footer& footer = rep_->footer; - // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. @@ -3167,25 +3168,12 @@ Status BlockBasedTable::CreateIndexReader( switch (index_type_on_file) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create( - this, file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, index_reader); } case BlockBasedTableOptions::kBinarySearch: { - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -3200,29 +3188,15 @@ Status BlockBasedTable::CreateIndexReader( ROCKS_LOG_WARN(rep_->ioptions.info_log, "Unable to read the metaindex block." 
" Fall back to binary search index."); - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return BinarySearchIndexReader::Create(this, prefetch_buffer, + use_cache, prefetch, pin, + index_reader); } meta_index_iter = meta_iter_guard.get(); } - return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, - rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, - index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, index_reader); } default: { std::string error_message = @@ -3261,8 +3235,10 @@ bool BlockBasedTable::TEST_filter_block_preloaded() const { return rep_->filter != nullptr; } -bool BlockBasedTable::TEST_index_reader_preloaded() const { - return rep_->index_reader != nullptr; +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); } Status BlockBasedTable::GetKVPairsFromDataBlocks( @@ -3479,12 +3455,6 @@ void BlockBasedTable::Close() { rep_->filter_handle, cache_key); cache->Erase(key); - // Get the index block key - key = GetCacheKeyFromOffset(rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - cache->Erase(key); - if (!rep_->compression_dict_handle.IsNull()) { // Get the compression dictionary block key key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, @@ -3674,15 +3644,6 @@ void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { delete filter; } -void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { - IndexReader* index_reader = reinterpret_cast(value); - if (index_reader->statistics() != nullptr) { - RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, - index_reader->ApproximateMemoryUsage()); - } - delete index_reader; -} - void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { UncompressionDict* dict = reinterpret_cast(value); RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 270409b3ab6..54ce34d617b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -150,6 +150,8 @@ class BlockBasedTable : public TableReader { // be close to the file length. uint64_t ApproximateOffsetOf(const Slice& key) override; + bool TEST_BlockInCache(const BlockHandle& handle) const; + // Returns true if the block for the specified key is in cache. 
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
@@ -173,54 +175,35 @@ class BlockBasedTable : public TableReader {
  ~BlockBasedTable();

  bool TEST_filter_block_preloaded() const;
-  bool TEST_index_reader_preloaded() const;
+  bool TEST_IndexBlockInCache() const;

-  // IndexReader is the interface that provide the functionality for index
+  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
-    explicit IndexReader(const InternalKeyComparator* icomparator,
-                         Statistics* stats)
-        : icomparator_(icomparator), statistics_(stats) {}
-
-    virtual ~IndexReader() {}
-
-    // Create an iterator for index access.
-    // If iter is null then a new object is created on heap and the callee will
-    // have the ownership. If a non-null iter is passed in it will be used, and
-    // the returned value is either the same as iter or a new on-heap object
-    // that
-    // wrapps the passed iter. In the latter case the return value would point
-    // to
-    // a different object then iter and the callee has the ownership of the
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned value
+    // is either the same as iter or a new on-heap object that
+    // wraps the passed iter. In the latter case the return value points
+    // to a different object than iter, and the callee has the ownership of the
    // returned object.
    virtual InternalIteratorBase<BlockHandle>* NewIterator(
-        IndexBlockIter* iter = nullptr, bool total_order_seek = true,
-        bool fill_cache = true) = 0;
-
-    // The size of the index.
-    virtual size_t size() const = 0;
-    // Memory usage of the index block
-    virtual size_t usable_size() const = 0;
-    // return the statistics pointer
-    virtual Statistics* statistics() const { return statistics_; }
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context) = 0;
+
    // Report an approximation of how much memory has been used other than
-    // memory
-    // that was allocated in block cache.
+    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
-
-    virtual void CacheDependencies(bool /* unused */) {}
-
-    // Prefetch all the blocks referenced by this index to the buffer
-    void PrefetchBlocks(FilePrefetchBuffer* buf);
-
-   protected:
-    const InternalKeyComparator* icomparator_;
-
-   private:
-    Statistics* statistics_;
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual void CacheDependencies(bool /* pin */) {}
  };

+  class IndexReaderCommon;
+
  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
@@ -271,11 +254,22 @@
  // in uncompressed block cache, also sets cache_handle to reference that
  // block.
  static Status MaybeReadBlockAndLoadToCache(
-      FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
      CachableEntry<Block>* block_entry, bool is_index = false,
      GetContext* get_context = nullptr);

+  // Similar to the above, with one crucial difference: it will retrieve the
+  // block from the file even if there are no caches configured (assuming the
+  // read options allow I/O).
+  static Status RetrieveBlock(
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block_entry, bool is_index,
+      GetContext* get_context);
+
  // For the following two functions:
  // if `no_io == true`, we will not try to read filter/index from sst file
  // were they not present in cache yet.
@@ -305,7 +299,6 @@ class BlockBasedTable : public TableReader {
  InternalIteratorBase<BlockHandle>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check = false,
      IndexBlockIter* input_iter = nullptr,
-      CachableEntry<IndexReader>* index_entry = nullptr,
      GetContext* get_context = nullptr);

  // Read block cache from block caches (if set): block_cache and
@@ -316,7 +309,7 @@
  // dictionary.
  static Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
-      Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
+      Cache* block_cache, Cache* block_cache_compressed, const Rep* rep,
      const ReadOptions& read_options, CachableEntry<Block>* block,
      const UncompressionDict& uncompression_dict,
      size_t read_amp_bytes_per_bit, bool is_index = false,
@@ -359,9 +352,9 @@
  // need to access extra meta blocks for index construction. This parameter
  // helps avoid re-reading meta index block if caller already created one.
  Status CreateIndexReader(
-      FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
-      InternalIterator* preloaded_meta_index_iter = nullptr,
-      const int level = -1);
+      FilePrefetchBuffer* prefetch_buffer,
+      InternalIterator* preloaded_meta_index_iter, bool use_cache,
+      bool prefetch, bool pin, IndexReader** index_reader);

  bool FullFilterKeyMayMatch(
      const ReadOptions& read_options, FilterBlockReader* filter,
@@ -398,9 +391,8 @@
  static Status PrefetchIndexAndFilterBlocks(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
-      const SliceTransform* prefix_extractor, bool prefetch_all,
-      const BlockBasedTableOptions& table_options, const int level,
-      const bool prefetch_index_and_filter_in_cache);
+      bool prefetch_all, const BlockBasedTableOptions& table_options,
+      const int level);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
@@ -411,7 +403,7 @@
      const bool is_a_filter_partition,
      const SliceTransform* prefix_extractor = nullptr) const;

-  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
+  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file
  static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
@@ -486,18 +478,21 @@ struct BlockBasedTable::Rep {
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
-  uint64_t dummy_index_reader_offset =
-      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
-  // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e.,
-  // non-nullptr) and used only when options.block_cache is nullptr or when
-  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index,
-  // filter, and compression dictionary blocks via the block cache. In that case
-  // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle`
-  // are used to lookup these meta-blocks in block cache.
+  // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr)
+  // and used only when options.block_cache is nullptr or when
+  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the
+  // filter and compression dictionary blocks via the block cache. In that
+  // case, `filter_handle` and `compression_dict_handle` are used to look up
+  // these meta-blocks in block cache.
+  //
+  // Note: the IndexReader object is always stored in this member variable;
+  // the index block itself, however, may or may not be in the block cache
+  // based on the settings above. We plan to change the handling of the
+  // filter and compression dictionary similarly.
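For concreteness, the modes this comment distinguishes are driven by a handful of BlockBasedTableOptions fields. Below is a minimal sketch using the public RocksDB API; the function name, cache size, and specific option choices are illustrative, not part of this patch:

    #include <memory>
    #include "rocksdb/cache.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: options for the "metadata in block cache" mode described above.
    // With cache_index_and_filter_blocks enabled, filter and compression
    // dictionary blocks are looked up in block_cache via their handles, and
    // the index block is cached as well, while the IndexReader object itself
    // keeps living in Rep::index_reader.
    rocksdb::Options MakeCachedMetadataOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = rocksdb::NewLRUCache(8 << 20);  // illustrative
      table_options.cache_index_and_filter_blocks = true;
      // Optionally keep L0 / top-level metadata pinned while cached:
      table_options.pin_l0_filter_and_index_blocks_in_cache = true;
      table_options.pin_top_level_index_and_filter = true;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }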
  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDict> uncompression_dict;
@@ -526,12 +521,11 @@ struct BlockBasedTable::Rep {
  // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
  // true or in all levels when pin_top_level_index_and_filter is set in
-  // combination with partitioned index/filters: then we do use the LRU cache,
-  // but we always keep the filter & index block's handle checked out here (=we
+  // combination with partitioned filters: then we do use the LRU cache,
+  // but we always keep the filter block's handle checked out here (=we
  // don't call Release()), plus the parsed out objects the LRU cache will never
  // push flush them out, hence they're pinned
  CachableEntry<FilterBlockReader> filter_entry;
-  CachableEntry<IndexReader> index_entry;
  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all Keys in this file will have the same
diff --git a/table/table_test.cc b/table/table_test.cc
index dccc4919409..aeb66f8d35f 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1993,7 +1993,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
  // preloading filter/index blocks is enabled.
  auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
  ASSERT_TRUE(reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+  ASSERT_FALSE(reader->TEST_IndexBlockInCache());

  {
    // nothing happens in the beginning
@@ -2040,7 +2040,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
  // preloading filter/index blocks is prohibited.
  auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+  ASSERT_TRUE(reader->TEST_IndexBlockInCache());

  // -- PART 1: Open with regular block cache.
  // Since block_cache is disabled, no cache activities will be involved.
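The flipped assertions above capture the new invariant end to end: the IndexReader now always lives in the table object, and the index block itself appears in the block cache only when cache_index_and_filter_blocks is enabled. A runnable sketch of observing this through the public API follows; the database path and cache size are illustrative, error handling is elided, and note the index block may already be prefetched into the cache at table-open time, hence the non-strict comparison:

    #include <cassert>
    #include <memory>
    #include <string>
    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1 << 20);

      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = cache;
      table_options.cache_index_and_filter_blocks = true;

      rocksdb::Options options;
      options.create_if_missing = true;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/index_cache_demo", &db);
      assert(s.ok());

      db->Put(rocksdb::WriteOptions(), "key", "value");
      db->Flush(rocksdb::FlushOptions());  // write an SST with an index block

      const size_t usage_before = cache->GetUsage();
      std::string value;
      db->Get(rocksdb::ReadOptions(), "key", &value);  // index served via cache

      // With cache_index_and_filter_blocks == true, the index block is charged
      // to the block cache rather than held outside it.
      assert(cache->GetUsage() >= usage_before);

      delete db;
      return 0;
    }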
@@ -2612,69 +2612,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); } -TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { - // A regression test to avoid data race described in - // https://github.com/facebook/rocksdb/issues/1267 - TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); - std::vector keys; - stl_wrappers::KVMap kvmap; - c.Add("a1", "val1"); - Options options; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache = NewLRUCache(0); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - c.Finish(options, ioptions, moptions, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); - - rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( - { - {"BlockBasedTable::NewIndexIterator::thread1:1", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTable::NewIndexIterator::thread2:3", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - }, - { - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:1"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:3"}, - }); - - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions ro; - auto* reader = c.GetTableReader(); - - std::function func1 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); - // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); - }; - - std::function func2 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - }; - - auto thread1 = port::Thread(func1); - auto thread2 = port::Thread(func2); - thread1.join(); - thread2.join(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - c.ResetTableReader(); -} - // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(PlainTableTest, BasicPlainTableProperties) { From e62986260f12abad62d84182d106daeb147168e7 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 30 May 2019 14:01:44 -0700 Subject: [PATCH 084/572] Fix env_options_for_read spelling in CompactionJob Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5380 Differential Revision: D15563386 Pulled By: sagar0 fbshipit-source-id: 8b26aef47cfc40ff8016daf815582f21cdd40df2 --- db/compaction_job.cc | 4 ++-- db/compaction_job.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 91c7f437a17..9e5d46f877d 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -324,7 +324,7 @@ CompactionJob::CompactionJob( db_options_(db_options), env_options_(env_options), env_(db_options.env), - 
env_optiosn_for_read_( + env_options_for_read_( env_->OptimizeForCompactionTableRead(env_options, db_options_)), versions_(versions), shutting_down_(shutting_down), @@ -836,7 +836,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); + sub_compact->compaction, &range_del_agg, env_options_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); diff --git a/db/compaction_job.h b/db/compaction_job.h index a37c54de809..0751727d704 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -151,7 +151,7 @@ class CompactionJob { Env* env_; // env_option optimized for compaction table reads - EnvOptions env_optiosn_for_read_; + EnvOptions env_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; const SequenceNumber preserve_deletes_seqnum_; From 50e470791dafb3db017f055f79323aef9a607e43 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Thu, 30 May 2019 14:47:29 -0700 Subject: [PATCH 085/572] Organizing rocksdb/table directory by format Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5373 Differential Revision: D15559425 Pulled By: vjnadimpalli fbshipit-source-id: 5d6d6d615582bedd96a4b879bb25d429a6de8b55 --- CMakeLists.txt | 57 ++++++++--------- Makefile | 14 ++--- TARGETS | 58 ++++++++--------- db/builder.cc | 2 +- db/column_family.cc | 2 +- db/compaction_job.cc | 4 +- db/compaction_job_stats_test.cc | 4 +- db/corruption_test.cc | 3 +- db/cuckoo_table_db_test.cc | 4 +- db/db_impl.cc | 4 +- db/db_impl_open.cc | 2 +- db/db_iterator_test.cc | 2 +- db/db_test.cc | 4 +- db/db_test_util.h | 5 +- db/flush_job.cc | 4 +- db/internal_stats.cc | 2 +- db/listener_test.cc | 5 +- db/plain_table_db_test.cc | 6 +- db/table_properties_collector_test.cc | 7 ++- db/version_set.cc | 2 +- options/options.cc | 2 +- options/options_helper.cc | 5 +- options/options_parser.h | 2 +- src.mk | 62 +++++++++---------- .../{ => adaptive}/adaptive_table_factory.cc | 2 +- table/{ => adaptive}/adaptive_table_factory.h | 0 table/{ => block_based}/block.cc | 6 +- table/{ => block_based}/block.h | 6 +- .../block_based_filter_block.cc | 2 +- .../block_based_filter_block.h | 3 +- .../block_based_filter_block_test.cc | 2 +- .../block_based_table_builder.cc | 20 +++--- .../block_based_table_builder.h | 2 +- .../block_based_table_factory.cc | 6 +- .../block_based_table_factory.h | 0 .../block_based_table_reader.cc | 21 +++---- .../block_based_table_reader.h | 8 +-- table/{ => block_based}/block_builder.cc | 4 +- table/{ => block_based}/block_builder.h | 2 +- table/{ => block_based}/block_prefix_index.cc | 2 +- table/{ => block_based}/block_prefix_index.h | 0 table/{ => block_based}/block_test.cc | 5 +- table/{ => block_based}/cachable_entry.h | 0 table/{ => block_based}/data_block_footer.cc | 2 +- table/{ => block_based}/data_block_footer.h | 0 .../data_block_hash_index.cc | 2 +- .../{ => block_based}/data_block_hash_index.h | 0 .../data_block_hash_index_test.cc | 9 +-- table/{ => block_based}/filter_block.h | 2 +- table/{ => block_based}/flush_block_policy.cc | 2 +- table/{ => block_based}/flush_block_policy.h | 0 table/{ => block_based}/full_filter_block.cc | 2 +- table/{ => block_based}/full_filter_block.h | 3 +- 
.../full_filter_block_test.cc | 2 +- table/{ => block_based}/index_builder.cc | 5 +- table/{ => block_based}/index_builder.h | 4 +- .../partitioned_filter_block.cc | 6 +- .../partitioned_filter_block.h | 11 ++-- .../partitioned_filter_block_test.cc | 8 ++- table/block_fetcher.cc | 4 +- table/block_fetcher.h | 2 +- table/{ => cuckoo}/cuckoo_table_builder.cc | 6 +- table/{ => cuckoo}/cuckoo_table_builder.h | 0 .../{ => cuckoo}/cuckoo_table_builder_test.cc | 2 +- table/{ => cuckoo}/cuckoo_table_factory.cc | 6 +- table/{ => cuckoo}/cuckoo_table_factory.h | 0 table/{ => cuckoo}/cuckoo_table_reader.cc | 4 +- table/{ => cuckoo}/cuckoo_table_reader.h | 0 .../{ => cuckoo}/cuckoo_table_reader_test.cc | 6 +- table/format.cc | 6 +- table/get_context.h | 2 +- table/meta_blocks.cc | 4 +- table/meta_blocks.h | 2 +- table/persistent_cache_helper.cc | 2 +- table/{ => plain}/plain_table_builder.cc | 8 +-- table/{ => plain}/plain_table_builder.h | 5 +- table/{ => plain}/plain_table_factory.cc | 6 +- table/{ => plain}/plain_table_factory.h | 0 table/{ => plain}/plain_table_index.cc | 2 +- table/{ => plain}/plain_table_index.h | 0 table/{ => plain}/plain_table_key_coding.cc | 6 +- table/{ => plain}/plain_table_key_coding.h | 3 +- table/{ => plain}/plain_table_reader.cc | 10 +-- table/{ => plain}/plain_table_reader.h | 4 +- table/sst_file_writer.cc | 3 +- table/table_properties.cc | 3 +- table/table_reader_bench.cc | 4 +- table/table_test.cc | 18 +++--- table/two_level_iterator.cc | 2 +- test_util/testutil.h | 4 +- tools/sst_dump_test.cc | 2 +- tools/sst_dump_tool.cc | 11 ++-- tools/trace_analyzer_tool.cc | 2 +- util/bloom.cc | 4 +- utilities/blob_db/blob_db_impl.cc | 7 ++- utilities/memory/memory_test.cc | 2 +- .../persistent_cache_bench.cc | 2 +- .../persistent_cache/persistent_cache_test.h | 2 +- 98 files changed, 292 insertions(+), 275 deletions(-) rename table/{ => adaptive}/adaptive_table_factory.cc (98%) rename table/{ => adaptive}/adaptive_table_factory.h (100%) rename table/{ => block_based}/block.cc (99%) rename table/{ => block_based}/block.h (99%) rename table/{ => block_based}/block_based_filter_block.cc (99%) rename table/{ => block_based}/block_based_filter_block.h (99%) rename table/{ => block_based}/block_based_filter_block_test.cc (99%) rename table/{ => block_based}/block_based_table_builder.cc (98%) rename table/{ => block_based}/block_based_table_builder.h (100%) rename table/{ => block_based}/block_based_table_factory.cc (99%) rename table/{ => block_based}/block_based_table_factory.h (100%) rename table/{ => block_based}/block_based_table_reader.cc (99%) rename table/{ => block_based}/block_based_table_reader.h (99%) rename table/{ => block_based}/block_builder.cc (98%) rename table/{ => block_based}/block_builder.h (98%) rename table/{ => block_based}/block_prefix_index.cc (99%) rename table/{ => block_based}/block_prefix_index.h (100%) rename table/{ => block_based}/block_test.cc (99%) rename table/{ => block_based}/cachable_entry.h (100%) rename table/{ => block_based}/data_block_footer.cc (97%) rename table/{ => block_based}/data_block_footer.h (100%) rename table/{ => block_based}/data_block_hash_index.cc (98%) rename table/{ => block_based}/data_block_hash_index.h (100%) rename table/{ => block_based}/data_block_hash_index_test.cc (99%) rename table/{ => block_based}/filter_block.h (99%) rename table/{ => block_based}/flush_block_policy.cc (98%) rename table/{ => block_based}/flush_block_policy.h (100%) rename table/{ => block_based}/full_filter_block.cc (99%) rename table/{ => 
block_based}/full_filter_block.h (99%) rename table/{ => block_based}/full_filter_block_test.cc (99%) rename table/{ => block_based}/index_builder.cc (98%) rename table/{ => block_based}/index_builder.h (99%) rename table/{ => block_based}/partitioned_filter_block.cc (98%) rename table/{ => block_based}/partitioned_filter_block.h (95%) rename table/{ => block_based}/partitioned_filter_block_test.cc (99%) rename table/{ => cuckoo}/cuckoo_table_builder.cc (99%) rename table/{ => cuckoo}/cuckoo_table_builder.h (100%) rename table/{ => cuckoo}/cuckoo_table_builder_test.cc (99%) rename table/{ => cuckoo}/cuckoo_table_factory.cc (94%) rename table/{ => cuckoo}/cuckoo_table_factory.h (100%) rename table/{ => cuckoo}/cuckoo_table_reader.cc (99%) rename table/{ => cuckoo}/cuckoo_table_reader.h (100%) rename table/{ => cuckoo}/cuckoo_table_reader_test.cc (99%) rename table/{ => plain}/plain_table_builder.cc (98%) rename table/{ => plain}/plain_table_builder.h (98%) rename table/{ => plain}/plain_table_factory.cc (98%) rename table/{ => plain}/plain_table_factory.h (100%) rename table/{ => plain}/plain_table_index.cc (99%) rename table/{ => plain}/plain_table_index.h (100%) rename table/{ => plain}/plain_table_key_coding.cc (99%) rename table/{ => plain}/plain_table_key_coding.h (99%) rename table/{ => plain}/plain_table_reader.cc (99%) rename table/{ => plain}/plain_table_reader.h (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6449047fca6..5614c83b44a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -566,36 +566,36 @@ set(SOURCES options/options_parser.cc options/options_sanity_check.cc port/stack_trace.cc - table/adaptive_table_factory.cc - table/block.cc - table/block_based_filter_block.cc - table/block_based_table_builder.cc - table/block_based_table_factory.cc - table/block_based_table_reader.cc - table/block_builder.cc + table/adaptive/adaptive_table_factory.cc + table/block_based/block.cc + table/block_based/block_based_filter_block.cc + table/block_based/block_based_table_builder.cc + table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_reader.cc + table/block_based/block_builder.cc + table/block_based/block_prefix_index.cc + table/block_based/data_block_hash_index.cc + table/block_based/data_block_footer.cc + table/block_based/flush_block_policy.cc + table/block_based/full_filter_block.cc + table/block_based/index_builder.cc + table/block_based/partitioned_filter_block.cc table/block_fetcher.cc - table/block_prefix_index.cc table/bloom_block.cc - table/cuckoo_table_builder.cc - table/cuckoo_table_factory.cc - table/cuckoo_table_reader.cc - table/data_block_hash_index.cc - table/data_block_footer.cc - table/flush_block_policy.cc + table/cuckoo/cuckoo_table_builder.cc + table/cuckoo/cuckoo_table_factory.cc + table/cuckoo/cuckoo_table_reader.cc table/format.cc - table/full_filter_block.cc table/get_context.cc - table/index_builder.cc table/iterator.cc table/merging_iterator.cc table/meta_blocks.cc - table/partitioned_filter_block.cc table/persistent_cache_helper.cc - table/plain_table_builder.cc - table/plain_table_factory.cc - table/plain_table_index.cc - table/plain_table_key_coding.cc - table/plain_table_reader.cc + table/plain/plain_table_builder.cc + table/plain/plain_table_factory.cc + table/plain/plain_table_index.cc + table/plain/plain_table_key_coding.cc + table/plain/plain_table_reader.cc table/sst_file_reader.cc table/sst_file_writer.cc table/table_properties.cc @@ -940,13 +940,14 @@ if(WITH_TESTS) monitoring/statistics_test.cc 
options/options_settable_test.cc options/options_test.cc - table/block_based_filter_block_test.cc - table/block_test.cc + table/block_based/block_based_filter_block_test.cc + table/block_based/block_test.cc + table/block_based/data_block_hash_index_test.cc + table/block_based/full_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc - table/cuckoo_table_builder_test.cc - table/cuckoo_table_reader_test.cc - table/data_block_hash_index_test.cc - table/full_filter_block_test.cc + table/cuckoo/cuckoo_table_builder_test.cc + table/cuckoo/cuckoo_table_reader_test.cc table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc diff --git a/Makefile b/Makefile index 16d5da0b16c..d41192ab2e0 100644 --- a/Makefile +++ b/Makefile @@ -1378,13 +1378,13 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -partitioned_filter_block_test: table/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1396,10 +1396,10 @@ cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1465,10 +1465,10 @@ rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index c438aa3fb45..70d6e219413 100644 --- a/TARGETS +++ b/TARGETS @@ -173,36 +173,36 @@ cpp_library( "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", - "table/adaptive_table_factory.cc", - "table/block.cc", - "table/block_based_filter_block.cc", - "table/block_based_table_builder.cc", - "table/block_based_table_factory.cc", - "table/block_based_table_reader.cc", - "table/block_builder.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + 
"table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/index_builder.cc", + "table/block_based/partitioned_filter_block.cc", "table/block_fetcher.cc", - "table/block_prefix_index.cc", "table/bloom_block.cc", - "table/cuckoo_table_builder.cc", - "table/cuckoo_table_factory.cc", - "table/cuckoo_table_reader.cc", - "table/data_block_footer.cc", - "table/data_block_hash_index.cc", - "table/flush_block_policy.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", "table/format.cc", - "table/full_filter_block.cc", "table/get_context.cc", - "table/index_builder.cc", "table/iterator.cc", "table/merging_iterator.cc", "table/meta_blocks.cc", - "table/partitioned_filter_block.cc", "table/persistent_cache_helper.cc", - "table/plain_table_builder.cc", - "table/plain_table_factory.cc", - "table/plain_table_index.cc", - "table/plain_table_key_coding.cc", - "table/plain_table_reader.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", "table/table_properties.cc", @@ -378,12 +378,12 @@ ROCKS_TESTS = [ ], [ "block_based_filter_block_test", - "table/block_based_filter_block_test.cc", + "table/block_based/block_based_filter_block_test.cc", "serial", ], [ "block_test", - "table/block_test.cc", + "table/block_based/block_test.cc", "serial", ], [ @@ -488,7 +488,7 @@ ROCKS_TESTS = [ ], [ "cuckoo_table_builder_test", - "table/cuckoo_table_builder_test.cc", + "table/cuckoo/cuckoo_table_builder_test.cc", "serial", ], [ @@ -498,12 +498,12 @@ ROCKS_TESTS = [ ], [ "cuckoo_table_reader_test", - "table/cuckoo_table_reader_test.cc", + "table/cuckoo/cuckoo_table_reader_test.cc", "serial", ], [ "data_block_hash_index_test", - "table/data_block_hash_index_test.cc", + "table/block_based/data_block_hash_index_test.cc", "serial", ], [ @@ -743,7 +743,7 @@ ROCKS_TESTS = [ ], [ "full_filter_block_test", - "table/full_filter_block_test.cc", + "table/block_based/full_filter_block_test.cc", "serial", ], [ @@ -873,7 +873,7 @@ ROCKS_TESTS = [ ], [ "partitioned_filter_block_test", - "table/partitioned_filter_block_test.cc", + "table/block_based/partitioned_filter_block_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 2b97ce1d608..14160f64c75 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -29,7 +29,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" #include "util/file_reader_writer.h" diff --git a/db/column_family.cc b/db/column_family.cc index 325610b8844..84f521cd7b8 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -33,7 +33,7 @@ #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" 
#include "util/autovector.h" #include "util/compression.h" diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 9e5d46f877d..9e22e161f28 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -49,8 +49,8 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "util/coding.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 91441f5d76a..daf41386690 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -48,9 +48,9 @@ #include "rocksdb/thread_status.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/hash.h" diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 379c33e4599..130821ff997 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -25,8 +25,9 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "file/filename.h" #include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index ecd6d71ca2e..f9efbc58503 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -8,8 +8,8 @@ #include "db/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" #include "util/string_util.h" #include "test_util/testharness.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index 749bd3629a0..ec162bb961e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -77,8 +77,8 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 0be85031ba3..db47d141655 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -18,7 +18,7 @@ #include "file/sst_file_manager_impl.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/rate_limiter.h" #include "test_util/sync_point.h" diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index cc1af2e0ad8..e2b9f503ffb 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -15,7 +15,7 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "table/flush_block_policy.h" +#include 
"table/block_based/flush_block_policy.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 66df2323de2..bf0babd1a3a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -54,9 +54,9 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/file_reader_writer.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index 3bc107889b4..1882cde59dc 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once + #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -41,9 +42,9 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "test_util/mock_time_env.h" diff --git a/db/flush_job.cc b/db/flush_job.cc index 4930ecac7e9..c8729c66840 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -40,8 +40,8 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 57c7427e801..58332f30faf 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -23,7 +23,7 @@ #include "db/column_family.h" #include "db/db_impl.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/listener_test.cc b/db/listener_test.cc index 663116b7b8d..881534a1f1d 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -22,8 +22,9 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" +#include "file/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 7648ed85ff7..bfeb54243d9 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -25,9 +25,9 @@ #include "rocksdb/table.h" #include "table/bloom_block.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/plain/plain_table_reader.h" #include "table/table_builder.h" #include "util/hash.h" #include "util/logging.h" diff --git a/db/table_properties_collector_test.cc 
b/db/table_properties_collector_test.cc index 6171b2938c2..0705cc032fe 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -12,11 +12,12 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" + +#include "table/meta_blocks.h" #include "options/cf_options.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" diff --git a/db/version_set.cc b/db/version_set.cc index b9616f3730b..864fc975358 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -45,7 +45,7 @@ #include "table/merging_iterator.h" #include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "util/coding.h" diff --git a/options/options.cc b/options/options.cc index 057727e59fb..a5037ee78d3 100644 --- a/options/options.cc +++ b/options/options.cc @@ -31,7 +31,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/compression.h" namespace rocksdb { diff --git a/options/options_helper.cc b/options/options_helper.cc index 82e7a1fa13a..0b531a6ec5e 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -9,6 +9,8 @@ #include #include #include + +#include "table/plain/plain_table_factory.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -20,8 +22,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/cast_util.h" #include "util/string_util.h" diff --git a/options/options_parser.h b/options/options_parser.h index 5aab3e7e9b6..b2a806f179f 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -12,7 +12,7 @@ #include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" namespace rocksdb { diff --git a/src.mk b/src.mk index 100b3355e74..a0f4043bf76 100644 --- a/src.mk +++ b/src.mk @@ -97,36 +97,36 @@ LIB_SOURCES = \ options/options_sanity_check.cc \ port/port_posix.cc \ port/stack_trace.cc \ - table/adaptive_table_factory.cc \ - table/block.cc \ - table/block_based_filter_block.cc \ - table/block_based_table_builder.cc \ - table/block_based_table_factory.cc \ - table/block_based_table_reader.cc \ - table/block_builder.cc \ - table/block_fetcher.cc \ - table/block_prefix_index.cc \ - table/bloom_block.cc \ - table/cuckoo_table_builder.cc \ - table/cuckoo_table_factory.cc \ - table/cuckoo_table_reader.cc \ - table/data_block_hash_index.cc \ - table/data_block_footer.cc \ - table/flush_block_policy.cc \ + table/adaptive/adaptive_table_factory.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + 
table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_fetcher.cc \ + table/bloom_block.cc \ + table/cuckoo/cuckoo_table_builder.cc \ + table/cuckoo/cuckoo_table_factory.cc \ + table/cuckoo/cuckoo_table_reader.cc \ table/format.cc \ - table/full_filter_block.cc \ table/get_context.cc \ - table/index_builder.cc \ table/iterator.cc \ table/merging_iterator.cc \ table/meta_blocks.cc \ - table/partitioned_filter_block.cc \ table/persistent_cache_helper.cc \ - table/plain_table_builder.cc \ - table/plain_table_factory.cc \ - table/plain_table_index.cc \ - table/plain_table_key_coding.cc \ - table/plain_table_reader.cc \ + table/plain/plain_table_builder.cc \ + table/plain/plain_table_factory.cc \ + table/plain/plain_table_index.cc \ + table/plain/plain_table_key_coding.cc \ + table/plain/plain_table_reader.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ table/table_properties.cc \ @@ -319,7 +319,6 @@ MAIN_SOURCES = \ db/obsolete_files_test.cc \ db/options_settable_test.cc \ db/options_file_test.cc \ - db/partitioned_filter_block_test.cc \ db/perf_context_test.cc \ db/persistent_cache_test.cc \ db/plain_table_db_test.cc \ @@ -348,13 +347,14 @@ MAIN_SOURCES = \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ options/options_test.cc \ - table/block_based_filter_block_test.cc \ - table/block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ - table/cuckoo_table_builder_test.cc \ - table/cuckoo_table_reader_test.cc \ - table/data_block_hash_index_test.cc \ - table/full_filter_block_test.cc \ + table/cuckoo/cuckoo_table_builder_test.cc \ + table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ table/table_reader_bench.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc similarity index 98% rename from table/adaptive_table_factory.cc rename to table/adaptive/adaptive_table_factory.cc index d5dcbc5f585..0086368a9bb 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef ROCKSDB_LITE -#include "table/adaptive_table_factory.h" +#include "table/adaptive/adaptive_table_factory.h" #include "table/table_builder.h" #include "table/format.h" diff --git a/table/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h similarity index 100% rename from table/adaptive_table_factory.h rename to table/adaptive/adaptive_table_factory.h diff --git a/table/block.cc b/table/block_based/block.cc similarity index 99% rename from table/block.cc rename to table/block_based/block.cc index a6cc8d2705f..dfc4aa3c679 100644 --- a/table/block.cc +++ b/table/block_based/block.cc @@ -9,7 +9,7 @@ // // Decodes the blocks generated by block_builder.cc. 
-#include "table/block.h" +#include "table/block_based/block.h" #include #include #include @@ -19,8 +19,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "table/block_prefix_index.h" -#include "table/data_block_footer.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" diff --git a/table/block.h b/table/block_based/block.h similarity index 99% rename from table/block.h rename to table/block_based/block.h index 869d2f1f286..8bf6f535612 100644 --- a/table/block.h +++ b/table/block_based/block.h @@ -22,13 +22,13 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "format.h" +#include "table/format.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "table/block_prefix_index.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" #include "test_util/sync_point.h" diff --git a/table/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc similarity index 99% rename from table/block_based_filter_block.cc rename to table/block_based/block_based_filter_block.cc index 81087b243b7..fb366b5d316 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include #include "db/dbformat.h" diff --git a/table/block_based_filter_block.h b/table/block_based/block_based_filter_block.h similarity index 99% rename from table/block_based_filter_block.h rename to table/block_based/block_based_filter_block.h index d1ff585462a..74a2285e1e9 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -18,10 +18,11 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" #include "util/hash.h" namespace rocksdb { diff --git a/table/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc similarity index 99% rename from table/block_based_filter_block_test.cc rename to table/block_based/block_based_filter_block_test.cc index 2cb3abc27a6..8d074275ce6 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include "rocksdb/filter_policy.h" #include "util/coding.h" diff --git a/table/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc similarity index 98% rename from table/block_based_table_builder.cc rename to table/block_based/block_based_table_builder.cc index 9a1742e5f3a..034c6b238fd 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include #include @@ -20,6 +20,7 @@ #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -29,14 +30,15 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/table_builder.h" #include "util/coding.h" @@ -47,8 +49,6 @@ #include "util/string_util.h" #include "util/xxhash.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" namespace rocksdb { diff --git a/table/block_based_table_builder.h b/table/block_based/block_based_table_builder.h similarity index 100% rename from table/block_based_table_builder.h rename to table/block_based/block_based_table_builder.h index a1ef3889112..0c580b445dd 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -14,11 +14,11 @@ #include #include +#include "table/meta_blocks.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" diff --git a/table/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc similarity index 99% rename from table/block_based_table_factory.cc rename to table/block_based/block_based_table_factory.cc index 790a2c99ecc..609679394ea 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_table_factory.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -24,8 +23,9 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/flush_block_policy.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/table/block_based_table_factory.h b/table/block_based/block_based_table_factory.h similarity index 100% rename from table/block_based_table_factory.h rename to table/block_based/block_based_table_factory.h diff --git a/table/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc similarity index 99% rename from table/block_based_table_reader.cc rename to table/block_based/block_based_table_reader.cc index 82f96492662..725ecdb4e3f 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include #include @@ -15,6 +15,8 @@ #include #include +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" @@ -27,20 +29,17 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" - -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_fetcher.h" -#include "table/block_prefix_index.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/partitioned_filter_block.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" diff --git a/table/block_based_table_reader.h b/table/block_based/block_based_table_reader.h similarity index 99% rename from table/block_based_table_reader.h rename to table/block_based/block_based_table_reader.h index 54ce34d617b..6d265ba755b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -23,10 +23,10 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" -#include "table/cachable_entry.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" #include "table/format.h" #include 
"table/get_context.h" #include "table/multiget_context.h" diff --git a/table/block_builder.cc b/table/block_based/block_builder.cc similarity index 98% rename from table/block_builder.cc rename to table/block_based/block_builder.cc index c14b4f6d3ee..a6a240c8e0a 100644 --- a/table/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -31,13 +31,13 @@ // num_restarts: uint32 // restarts[i] contains the offset within the block of the ith restart point. -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include #include #include "db/dbformat.h" #include "rocksdb/comparator.h" -#include "table/data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/block_builder.h b/table/block_based/block_builder.h similarity index 98% rename from table/block_builder.h rename to table/block_based/block_builder.h index 0576279f501..153e57569a2 100644 --- a/table/block_builder.h +++ b/table/block_based/block_builder.h @@ -13,7 +13,7 @@ #include #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" namespace rocksdb { diff --git a/table/block_prefix_index.cc b/table/block_based/block_prefix_index.cc similarity index 99% rename from table/block_prefix_index.cc rename to table/block_based/block_prefix_index.cc index 67c749d4c3a..0050f1f1e58 100644 --- a/table/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/block_prefix_index.h" +#include "table/block_based/block_prefix_index.h" #include diff --git a/table/block_prefix_index.h b/table/block_based/block_prefix_index.h similarity index 100% rename from table/block_prefix_index.h rename to table/block_based/block_prefix_index.h diff --git a/table/block_test.cc b/table/block_based/block_test.cc similarity index 99% rename from table/block_test.cc rename to table/block_based/block_test.cc index d359b4e59ca..a4c5678881e 100644 --- a/table/block_test.cc +++ b/table/block_based/block_test.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // + #include #include #include @@ -19,8 +20,8 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "util/random.h" #include "test_util/testharness.h" diff --git a/table/cachable_entry.h b/table/block_based/cachable_entry.h similarity index 100% rename from table/cachable_entry.h rename to table/block_based/cachable_entry.h diff --git a/table/data_block_footer.cc b/table/block_based/data_block_footer.cc similarity index 97% rename from table/data_block_footer.cc rename to table/block_based/data_block_footer.cc index cb9e1438152..2cf31b4c5ef 100644 --- a/table/data_block_footer.cc +++ b/table/block_based/data_block_footer.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "rocksdb/table.h" diff --git a/table/data_block_footer.h b/table/block_based/data_block_footer.h similarity index 100% rename from table/data_block_footer.h rename to table/block_based/data_block_footer.h diff --git a/table/data_block_hash_index.cc b/table/block_based/data_block_hash_index.cc similarity index 98% rename from table/data_block_hash_index.cc rename to table/block_based/data_block_hash_index.cc index adb1d7b8c26..7737a9491ee 100644 --- a/table/data_block_hash_index.cc +++ b/table/block_based/data_block_hash_index.cc @@ -6,7 +6,7 @@ #include #include "rocksdb/slice.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/data_block_hash_index.h b/table/block_based/data_block_hash_index.h similarity index 100% rename from table/data_block_hash_index.h rename to table/block_based/data_block_hash_index.h diff --git a/table/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc similarity index 99% rename from table/data_block_hash_index_test.cc rename to table/block_based/data_block_hash_index_test.cc index 0511b257aa3..204e92ecbe3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -3,16 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). + #include #include #include #include "db/table_properties_collector.h" #include "rocksdb/slice.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" #include "test_util/testharness.h" diff --git a/table/filter_block.h b/table/block_based/filter_block.h similarity index 99% rename from table/filter_block.h rename to table/block_based/filter_block.h index 8abb88e5f4f..8b01214c7eb 100644 --- a/table/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,7 +24,7 @@ #include #include #include "db/dbformat.h" -#include "format.h" +#include "table/format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" diff --git a/table/flush_block_policy.cc b/table/block_based/flush_block_policy.cc similarity index 98% rename from table/flush_block_policy.cc rename to table/block_based/flush_block_policy.cc index 1b1675828da..31576848c07 100644 --- a/table/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -6,7 +6,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include diff --git a/table/flush_block_policy.h b/table/block_based/flush_block_policy.h similarity index 100% rename from table/flush_block_policy.h rename to table/block_based/flush_block_policy.h diff --git a/table/full_filter_block.cc b/table/block_based/full_filter_block.cc similarity index 99% rename from table/full_filter_block.cc rename to table/block_based/full_filter_block.cc index 9015e96d2ea..56dc74c6710 100644 --- a/table/full_filter_block.cc +++ 
b/table/block_based/full_filter_block.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD diff --git a/table/full_filter_block.h b/table/block_based/full_filter_block.h similarity index 99% rename from table/full_filter_block.h rename to table/block_based/full_filter_block.h index f97952a7ced..3e5d82733b0 100644 --- a/table/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -10,12 +10,13 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "db/dbformat.h" #include "util/hash.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" namespace rocksdb { diff --git a/table/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc similarity index 99% rename from table/full_filter_block_test.cc rename to table/block_based/full_filter_block_test.cc index 0ef5c5a970c..8b99f54b03f 100644 --- a/table/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" diff --git a/table/index_builder.cc b/table/block_based/index_builder.cc similarity index 98% rename from table/index_builder.cc rename to table/block_based/index_builder.cc index 63cb80598fe..f11ecd4f4bc 100644 --- a/table/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -7,7 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/index_builder.h" +#include "table/block_based/index_builder.h" + #include #include @@ -17,7 +18,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" #include "table/format.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { diff --git a/table/index_builder.h b/table/block_based/index_builder.h similarity index 99% rename from table/index_builder.h rename to table/block_based/index_builder.h index 2f349fc5471..7e6a4bb0776 100644 --- a/table/index_builder.h +++ b/table/block_based/index_builder.h @@ -17,8 +17,8 @@ #include #include "rocksdb/comparator.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" namespace rocksdb { diff --git a/table/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc similarity index 98% rename from table/partitioned_filter_block.cc rename to table/block_based/partitioned_filter_block.cc index 3ccc7946393..315e63306f1 100644 --- a/table/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD @@ -17,8 +17,8 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h similarity index 95% rename from table/partitioned_filter_block.h rename to table/block_based/partitioned_filter_block.h index 2563dd2bf35..735f1c6e3eb 100644 --- a/table/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -9,15 +9,14 @@ #include #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" - -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/cachable_entry.h" -#include "table/full_filter_block.h" -#include "table/index_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/cachable_entry.h" #include "util/autovector.h" namespace rocksdb { diff --git a/table/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc similarity index 99% rename from table/partitioned_filter_block_test.cc rename to table/block_based/partitioned_filter_block_test.cc index 4bdc2fd36f1..2bcafa9771a 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -3,13 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). + #include #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" + +#include "index_builder.h" #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" @@ -96,7 +98,7 @@ class PartitionedFilterBlockTest partition_size * table_options_.block_size_deviation / 100; } - int last_offset = 10; + uint64_t last_offset = 10; BlockHandle Write(const Slice& slice) { BlockHandle bh(last_offset + 1, slice.size()); slices[bh.offset()] = slice; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 1f209210c13..6c663702900 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -15,8 +15,8 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 0dcdfc76125..56b74b50427 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" #include "util/memory_allocator.h" diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc similarity index 99% rename from table/cuckoo_table_builder.cc rename to table/cuckoo/cuckoo_table_builder.cc index f590e6ad405..f1a64cb6a67 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include #include @@ -15,8 +15,8 @@ #include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "table/block_builder.h" -#include "table/cuckoo_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" diff --git a/table/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h similarity index 100% rename from table/cuckoo_table_builder.h rename to table/cuckoo/cuckoo_table_builder.h diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc similarity index 99% rename from table/cuckoo_table_builder_test.cc rename to table/cuckoo/cuckoo_table_builder_test.cc index eeba9480592..1467e2a8d1b 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -11,7 +11,7 @@ #include #include "table/meta_blocks.h" -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc similarity index 94% rename from table/cuckoo_table_factory.cc rename to table/cuckoo/cuckoo_table_factory.cc index 74d18d51213..4ca29f364cf 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -4,11 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" namespace rocksdb { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h similarity index 100% rename from table/cuckoo_table_factory.h rename to table/cuckoo/cuckoo_table_factory.h diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc similarity index 99% rename from table/cuckoo_table_reader.cc rename to table/cuckoo/cuckoo_table_reader.cc index f4df2467fdb..72885be940e 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include #include @@ -19,7 +19,7 @@ #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" diff --git a/table/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h similarity index 100% rename from table/cuckoo_table_reader.h rename to table/cuckoo/cuckoo_table_reader.h diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc similarity index 99% rename from table/cuckoo_table_reader_test.cc rename to table/cuckoo/cuckoo_table_reader_test.cc index 6d596f6e115..71e231336c5 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -22,9 +22,9 @@ int main() { #include #include -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" #include "util/arena.h" diff --git a/table/format.cc b/table/format.cc index 476db85f731..1adcce6f3f4 100644 --- a/table/format.cc +++ b/table/format.cc @@ -12,12 +12,12 @@ #include #include +#include "block_fetcher.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" diff --git a/table/get_context.h b/table/get_context.h index 856e01a9502..8df343b3653 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -11,7 +11,7 @@ #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block.h" +#include "table/block_based/block.h" namespace rocksdb { class MergeContext; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 98e05a4d032..9d56c5b9c29 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -7,11 +7,11 @@ #include #include +#include "block_fetcher.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 6efd1225e19..5224c54714d 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -15,7 +15,7 @@ #include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "util/kv_map.h" diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 4e90697a6e5..8431f13db37 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "table/persistent_cache_helper.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" namespace rocksdb { diff --git a/table/plain_table_builder.cc b/table/plain/plain_table_builder.cc similarity index 98% rename from table/plain_table_builder.cc rename to table/plain/plain_table_builder.cc index 453b6c768b5..6160d7afd9e 100644 --- a/table/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_builder.h" +#include "table/plain/plain_table_builder.h" #include @@ -17,11 +17,11 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "db/dbformat.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/bloom_block.h" -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" diff --git a/table/plain_table_builder.h b/table/plain/plain_table_builder.h similarity index 98% rename from table/plain_table_builder.h rename to table/plain/plain_table_builder.h index 9a5b44b9c2c..0a29098d657 100644 --- a/table/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include @@ -13,8 +14,8 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/bloom_block.h" -#include "table/plain_table_index.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" #include "table/table_builder.h" namespace rocksdb { diff --git a/table/plain_table_factory.cc b/table/plain/plain_table_factory.cc similarity index 98% rename from table/plain_table_factory.cc rename to table/plain/plain_table_factory.cc index 0dccec55242..6c6905dab1f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include #include @@ -12,8 +12,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" -#include "table/plain_table_builder.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace rocksdb { diff --git a/table/plain_table_factory.h b/table/plain/plain_table_factory.h similarity index 100% rename from table/plain_table_factory.h rename to table/plain/plain_table_factory.h diff --git a/table/plain_table_index.cc b/table/plain/plain_table_index.cc similarity index 99% rename from table/plain_table_index.cc rename to table/plain/plain_table_index.cc index 43740923974..196be22cfe9 100644 --- a/table/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -11,7 +11,7 @@ #include -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/plain_table_index.h b/table/plain/plain_table_index.h similarity index 100% rename from table/plain_table_index.h rename to table/plain/plain_table_index.h diff --git a/table/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc similarity index 99% rename from table/plain_table_key_coding.cc rename to table/plain/plain_table_key_coding.cc index 6f5ee9b4ad2..9c4b614b549 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -4,13 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_key_coding.h" #include #include #include "db/dbformat.h" -#include "table/plain_table_reader.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" +#include "table/plain/plain_table_factory.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h similarity index 99% rename from table/plain_table_key_coding.h rename to table/plain/plain_table_key_coding.h index 93f8f7af4b5..26af3f6d8bd 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -4,12 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include "rocksdb/slice.h" #include "db/dbformat.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" // The file contains three helper classes of PlainTable format, // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. 
diff --git a/table/plain_table_reader.cc b/table/plain/plain_table_reader.cc similarity index 99% rename from table/plain_table_reader.cc rename to table/plain/plain_table_reader.cc index f33afdefc38..b4aad55876b 100644 --- a/table/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" #include #include @@ -19,15 +19,15 @@ #include "rocksdb/options.h" #include "rocksdb/statistics.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/bloom_block.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" #include "table/get_context.h" #include "monitoring/histogram.h" diff --git a/table/plain_table_reader.h b/table/plain/plain_table_reader.h similarity index 98% rename from table/plain_table_reader.h rename to table/plain/plain_table_reader.h index 12b22aaf12e..ec6e6a7febb 100644 --- a/table/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -19,8 +19,8 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/table_reader.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_index.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/arena.h" #include "util/dynamic_bloom.h" #include "util/file_reader_writer.h" diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 71b395fd6be..69993492d48 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -6,9 +6,10 @@ #include "rocksdb/sst_file_writer.h" #include + #include "db/dbformat.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" #include "util/file_reader_writer.h" #include "test_util/sync_point.h" diff --git a/table/table_properties.cc b/table/table_properties.cc index 8cfa2619591..6e481798c35 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/table_properties.h" + #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/internal_iterator.h" #include "table/table_properties_internal.h" #include "util/string_util.h" diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 6b05d385e06..f2ae016c10d 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -17,10 +17,10 @@ int main() { #include "rocksdb/db.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" diff --git a/table/table_test.cc b/table/table_test.cc index aeb66f8d35f..372443b536a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -16,11 +16,13 @@ #include #include +#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" +#include "meta_blocks.h" #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -32,18 +34,16 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/block_fetcher.h" -#include "table/flush_block_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "util/compression.h" diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a8f617dee29..ba883763e9f 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -11,7 +11,7 @@ #include "db/pinned_iterators_manager.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" #include "util/arena.h" diff --git a/test_util/testutil.h b/test_util/testutil.h index 2aab3df72c4..7890ce5f511 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -20,9 +20,9 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" #include "util/random.h" diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index a2c226b926c..ea27f3c8d45 100644 --- a/tools/sst_dump_test.cc +++ 
b/tools/sst_dump_test.cc @@ -13,7 +13,7 @@ #include "rocksdb/sst_dump_tool.h" #include "rocksdb/filter_policy.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "test_util/testharness.h" diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 5cbbfc38542..aa051da01f5 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -28,13 +29,13 @@ #include "rocksdb/status.h" #include "rocksdb/table_properties.h" #include "rocksdb/utilities/ldb_cmd.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "util/compression.h" #include "util/random.h" diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index a0186925243..03057afbc78 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -41,7 +41,7 @@ #include "rocksdb/utilities/ldb_cmd.h" #include "rocksdb/write_batch.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "tools/trace_analyzer_tool.h" #include "util/coding.h" diff --git a/util/bloom.cc b/util/bloom.cc index 1da4f2aa428..bedf4a65839 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -10,9 +10,9 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include "table/full_filter_bits_builder.h" -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #include "util/coding.h" #include "util/hash.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 54eb3f2dbb5..a93169c30cd 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
 // This source code is licensed under both the GPLv2 (found in the
 // COPYING file in the root directory) and Apache 2.0 License
@@ -22,9 +23,9 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/stackable_db.h"
 #include "rocksdb/utilities/transaction.h"
-#include "table/block.h"
-#include "table/block_based_table_builder.h"
-#include "table/block_builder.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_builder.h"
 #include "table/meta_blocks.h"
 #include "util/cast_util.h"
 #include "util/crc32c.h"
diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index c3ff640816e..224f7886bf1 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -10,7 +10,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/memory_util.h"
 #include "rocksdb/utilities/stackable_db.h"
-#include "table/block_based_table_factory.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "util/string_util.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
diff --git a/utilities/persistent_cache/persistent_cache_bench.cc b/utilities/persistent_cache/persistent_cache_bench.cc
index 64d75c7a518..8bc795455cd 100644
--- a/utilities/persistent_cache/persistent_cache_bench.cc
+++ b/utilities/persistent_cache/persistent_cache_bench.cc
@@ -23,7 +23,7 @@ int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
 #include "monitoring/histogram.h"
 #include "port/port.h"
-#include "table/block_builder.h"
+#include "table/block_based/block_builder.h"
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/stop_watch.h"
diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h
index 33cda4ea72d..6d15d13b69b 100644
--- a/utilities/persistent_cache/persistent_cache_test.h
+++ b/utilities/persistent_cache/persistent_cache_test.h
@@ -20,7 +20,7 @@
 #include "db/db_test_util.h"
 #include "rocksdb/cache.h"
-#include "table/block_builder.h"
+#include "table/block_based/block_builder.h"
 #include "port/port.h"
 #include "util/arena.h"
 #include "test_util/testharness.h"

From 1b59a490ef8d8da78c826b379167207dfa682b4c Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Thu, 30 May 2019 16:07:57 -0700
Subject: [PATCH 086/572] Fix flaky DBTest2.PresetCompressionDict test (#5378)

Summary:
Fix flaky DBTest2.PresetCompressionDict test. This PR fixes two issues with the test:

1. Replaces `GetSstFiles` with `TotalSize`, which is based on `DB::GetColumnFamilyMetaData`, so that only the size of the live SST files is taken into consideration when computing the total size of all SST files. Earlier, with `GetSstFiles`, even obsolete files were getting picked up.
2. In ZSTD compression, it is sometimes possible that using a trained dictionary is not better than using an untrained one. Using a trained dictionary performs well in 99% of the cases, but in the remaining ~1% of the cases (out of 10000 runs) using an untrained dictionary gets better compression results.
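For reference, a minimal sketch of the public API the two fixes rely on, assuming the `CompressionOptions` fields and `DB::GetColumnFamilyMetaData` interface current at the time of this patch; the helper names and byte sizes below are illustrative placeholders, not code from this PR:

// Minimal sketch (assumed usage, not part of this patch).
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"
#include "rocksdb/options.h"

// The three dictionary modes the test cycles through. Sizes are placeholders.
void ConfigureDictionary(rocksdb::Options* options, bool use_dict,
                         bool train_with_zstd) {
  // max_dict_bytes > 0 enables preset-dictionary compression.
  options->compression_opts.max_dict_bytes = use_dict ? 4 * 1024 : 0;
  // zstd_max_train_bytes > 0 additionally trains the dictionary; it only
  // takes effect when options->compression == rocksdb::kZSTD.
  options->compression_opts.zstd_max_train_bytes =
      (use_dict && train_with_zstd) ? 128 * 1024 : 0;
}

// Total size of the *live* SST files of a column family. Unlike listing the
// DB directory, GetColumnFamilyMetaData does not count obsolete files.
uint64_t LiveSstBytes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(cf, &meta);
  return meta.size;
}

A trained dictionary (`zstd_max_train_bytes > 0`) usually compresses better than an untrained one, but as the summary notes it is not guaranteed to, which is why the test below only asserts that some dictionary beats no dictionary.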
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5378
Differential Revision: D15559100
Pulled By: sagar0
fbshipit-source-id: c35adbf13871f520a2cec48f8bad9ff27ff7a0b4
---
 db/db_test2.cc | 58 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/db/db_test2.cc b/db/db_test2.cc
index d93beb4477f..109a7a377bf 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1036,8 +1036,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
   ASSERT_TRUE(index == keys_cf.size());
 }
 
-// Temporarily disable it because the test is flaky.
-TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
+TEST_F(DBTest2, PresetCompressionDict) {
   // Verifies that compression ratio improves when dictionary is enabled, and
   // improves even further when the dictionary is trained by ZSTD.
   const size_t kBlockSizeBytes = 4 << 10;
@@ -1046,7 +1045,8 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
   const int kNumL0Files = 5;
 
   Options options;
-  options.env = CurrentOptions().env; // Make sure to use any custom env that the test is configured with.
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
   options.allow_concurrent_memtable_write = false;
   options.arena_block_size = kBlockSizeBytes;
   options.create_if_missing = true;
@@ -1072,10 +1072,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
     compression_types.push_back(kZSTD);
   }
 
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
   for (auto compression_type : compression_types) {
     options.compression = compression_type;
-    size_t prev_out_bytes;
-    for (int i = 0; i < 3; ++i) {
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
       // First iteration: compress without preset dictionary
       // Second iteration: compress with preset dictionary
       // Third iteration (zstd only): compress with zstd-trained dictionary
@@ -1085,19 +1094,19 @@
       // the non-first iterations, verify the data we get out is the same data
       // we put in.
       switch (i) {
-        case 0:
+        case kWithoutDict:
           options.compression_opts.max_dict_bytes = 0;
           options.compression_opts.zstd_max_train_bytes = 0;
           break;
-        case 1:
-          options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
           options.compression_opts.zstd_max_train_bytes = 0;
           break;
-        case 2:
+        case kWithZSTDTrainedDict:
           if (compression_type != kZSTD) {
             continue;
           }
-          options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes;
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
           options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
           break;
         default:
@@ -1129,23 +1138,32 @@
       ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
       ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
 
-      size_t out_bytes = 0;
-      std::vector files;
-      GetSstFiles(env_, dbname_, &files);
-      for (const auto& file : files) {
-        uint64_t curr_bytes;
-        env_->GetFileSize(dbname_ + "/" + file, &curr_bytes);
-        out_bytes += static_cast(curr_bytes);
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
       }
 
       for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); j++) {
         ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j))));
       }
-      if (i) {
-        ASSERT_GT(prev_out_bytes, out_bytes);
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
       }
-      prev_out_bytes = out_bytes;
+      DestroyAndReopen(options);
     }
   }

From f1302ebab8c39ba441a33e73b8e37d75d53efa22 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Thu, 30 May 2019 16:09:45 -0700
Subject: [PATCH 087/572] Add class-level comments to version-related classes
 (#5348)

Summary:
As title.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5348
Differential Revision: D15564595
Pulled By: riversand963
fbshipit-source-id: dd45aa86a70e0343c2e9ef702fad165163f548e6
---
 db/version_set.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/db/version_set.h b/db/version_set.h
index 28ad0c2c234..776e08e448c 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -91,6 +91,9 @@ extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
                                       const std::vector& files,
                                       Arena* arena);
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, etc.
 class VersionStorageInfo {
  public:
   VersionStorageInfo(const InternalKeyComparator* internal_comparator,
@@ -537,6 +540,8 @@ class VersionStorageInfo {
 };
 
 using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the SST files owned by the column
+// family at a certain point in time.
 class Version {
  public:
   // Append to *iters a sequence of iterators that will
@@ -747,6 +752,9 @@ struct ObsoleteFileInfo {
 
 class BaseReferencedVersionBuilder;
 
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. set of the column families.
 class VersionSet {
  public:
   VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
@@ -1103,6 +1111,10 @@ class VersionSet {
                                      VersionEdit* edit, InstrumentedMutex* mu);
 };
 
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
 class ReactiveVersionSet : public VersionSet {
  public:
   ReactiveVersionSet(const std::string& dbname,

From 8843129ecef255a70f186e095063b4e79b2b0c73 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Thu, 30 May 2019 17:39:43 -0700
Subject: [PATCH 088/572] Move some memory related files from util/ to memory/
 (#5382)

Summary:
Move arena, allocator, and memory tools under util to a separate memory/ directory.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5382
Differential Revision: D15564655
Pulled By: siying
fbshipit-source-id: 9cd6b5d0d3d52b39606e19221fa154596e5852a5
---
 CMakeLists.txt | 8 +-
 Makefile | 2 +-
 TARGETS | 10 +-
 cache/cache_test.cc | 2 +-
 db/builder.cc | 2 +-
 db/column_family_test.cc | 4 +-
 db/compact_files_test.cc | 2 +-
 db/compaction.cc | 2 +-
 db/compaction.h | 2 +-
 db/compaction_iterator_test.cc | 2 +-
 db/compaction_job.cc | 2 +-
 db/compaction_job_stats_test.cc | 6 +-
 db/compaction_job_test.cc | 4 +-
 db/compaction_picker.cc | 2 +-
 db/compaction_picker_test.cc | 4 +-
 db/compaction_picker_universal.cc | 2 +-
 db/comparator_db_test.cc | 4 +-
 db/corruption_test.cc | 3 +-
 db/cuckoo_table_db_test.cc | 2 +-
 db/db_compaction_test.cc | 2 +-
 db/db_filesnapshot.cc | 2 +-
 db/db_impl.cc | 2 +-
 db/db_impl_compaction_flush.cc | 2 +-
 db/db_impl_open.cc | 2 +-
 db/db_iter.cc | 2 +-
 db/db_iter.h | 2 +-
 db/db_iter_stress_test.cc | 2 +-
 db/db_iter_test.cc | 2 +-
 db/db_options_test.cc | 2 +-
 db/db_test.cc | 6 +-
 db/db_test_util.h | 4 +-
 db/db_write_test.cc | 2 +-
 db/dbformat_test.cc | 2 +-
 db/deletefile_test.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 2 +-
 db/fault_injection_test.cc | 4 +-
 db/file_indexer.h | 2 +-
 db/file_indexer_test.cc | 2 +-
 db/filename_test.cc | 2 +-
 db/flush_job.cc | 2 +-
 db/flush_job_test.cc | 4 +-
 db/forward_iterator.cc | 2 +-
 db/forward_iterator.h | 4 +-
 db/forward_iterator_bench.cc | 2 +-
 db/listener_test.cc | 7 +-
 db/log_test.cc | 4 +-
 db/manual_compaction_test.cc | 4 +-
 db/memtable.cc | 4 +-
 db/memtable.h | 4 +-
 db/memtable_list.cc | 2 +-
 db/memtable_list_test.cc | 2 +-
 db/merge_helper_test.cc | 2 +-
 db/merge_test.cc | 8 +-
 db/obsolete_files_test.cc | 2 +-
 db/perf_context_test.cc | 2 +-
 db/plain_table_db_test.cc | 4 +-
 db/prefix_test.cc | 2 +-
 db/range_del_aggregator_bench.cc | 2 +-
 db/table_cache.cc | 2 +-
 db/table_properties_collector_test.cc | 6 +-
 db/version_builder_test.cc | 4 +-
 db/version_edit.cc | 2 +-
 db/version_edit.h | 6 +-
 db/version_edit_test.cc | 2 +-
 db/version_set.cc | 2 +-
 db/version_set_test.cc | 4 +-
 db/wal_manager.cc | 2 +-
 db/wal_manager_test.cc | 4 +-
 db/write_batch_test.cc | 2 +-
 db/write_callback_test.cc | 4 +-
 db/write_thread.cc | 2 +-
 env/env.cc | 2 +-
 env/env_posix.cc | 2 +-
env/env_test.cc | 6 +- env/io_posix.cc | 2 +- file/delete_scheduler.cc | 2 +- file/delete_scheduler_test.cc | 2 +- file/filename.cc | 2 +- file/sst_file_manager_impl.cc | 2 +- java/rocksjni/write_batch_test.cc | 2 +- {util => memory}/allocator.h | 0 {util => memory}/arena.cc | 4 +- {util => memory}/arena.h | 8 +- {util => memory}/arena_test.cc | 4 +- {util => memory}/concurrent_arena.cc | 2 +- {util => memory}/concurrent_arena.h | 4 +- {util => memory}/jemalloc_nodump_allocator.cc | 2 +- {util => memory}/jemalloc_nodump_allocator.h | 0 {util => memory}/memory_allocator.h | 0 {util => memory}/memory_usage.h | 0 memtable/alloc_tracker.cc | 4 +- memtable/hash_linklist_rep.cc | 2 +- memtable/hash_skiplist_rep.cc | 8 +- memtable/inlineskiplist.h | 2 +- memtable/inlineskiplist_test.cc | 4 +- memtable/memtablerep_bench.cc | 4 +- memtable/skiplist.h | 4 +- memtable/skiplist_test.cc | 4 +- memtable/skiplistrep.cc | 4 +- memtable/vectorrep.cc | 2 +- options/options_helper.cc | 2 +- options/options_parser.cc | 2 +- options/options_test.cc | 4 +- port/win/env_default.cc | 2 +- port/win/io_win.cc | 2 +- src.mk | 8 +- table/block_based/block.h | 4 +- .../block_based_filter_block_test.cc | 4 +- .../block_based/block_based_table_builder.cc | 5 +- table/block_based/block_based_table_builder.h | 2 +- .../block_based/block_based_table_factory.cc | 2 +- table/block_based/block_based_table_reader.cc | 178 ++++++++---------- table/block_based/block_based_table_reader.h | 24 +-- table/block_based/block_prefix_index.cc | 2 +- table/block_based/block_test.cc | 2 +- .../block_based/data_block_hash_index_test.cc | 1 - table/block_based/filter_block.h | 2 +- table/block_based/full_filter_block.h | 4 +- table/block_based/full_filter_block_test.cc | 4 +- table/block_based/index_builder.cc | 2 +- table/block_based/partitioned_filter_block.h | 2 +- .../partitioned_filter_block_test.cc | 7 +- table/block_fetcher.cc | 2 +- table/block_fetcher.h | 2 +- table/cuckoo/cuckoo_table_builder_test.cc | 4 +- table/cuckoo/cuckoo_table_reader.cc | 6 +- table/cuckoo/cuckoo_table_reader_test.cc | 6 +- table/format.cc | 2 +- table/format.h | 2 +- table/get_context.h | 2 +- table/iterator.cc | 2 +- table/merging_iterator.cc | 4 +- table/meta_blocks.cc | 2 +- table/mock_table.h | 4 +- table/plain/plain_table_builder.cc | 6 +- table/plain/plain_table_index.h | 2 +- table/plain/plain_table_key_coding.cc | 2 +- table/plain/plain_table_key_coding.h | 2 +- table/plain/plain_table_reader.cc | 8 +- table/plain/plain_table_reader.h | 4 +- table/sst_file_writer.cc | 2 +- table/table_reader_bench.cc | 4 +- table/table_test.cc | 6 +- table/two_level_iterator.cc | 2 +- tools/db_bench_tool.cc | 4 +- tools/db_bench_tool_test.cc | 2 +- tools/db_repl_stress.cc | 2 +- tools/db_stress.cc | 9 +- tools/reduce_levels_test.cc | 4 +- tools/sst_dump_test.cc | 2 +- tools/trace_analyzer_test.cc | 2 +- util/auto_roll_logger.h | 2 +- util/auto_roll_logger_test.cc | 2 +- util/autovector_test.cc | 4 +- util/bloom.cc | 2 +- util/bloom_test.cc | 6 +- util/compression.h | 2 +- util/dynamic_bloom.cc | 2 +- util/dynamic_bloom_test.cc | 6 +- util/event_logger_test.cc | 2 +- util/file_reader_writer.cc | 2 +- util/file_reader_writer.h | 2 +- util/file_reader_writer_test.cc | 2 +- util/filelock_test.cc | 4 +- util/hash_test.cc | 2 +- util/log_buffer.h | 6 +- util/log_write_bench.cc | 4 +- util/rate_limiter.cc | 2 +- util/rate_limiter_test.cc | 2 +- util/repeatable_thread_test.cc | 2 +- util/thread_local_test.cc | 4 +- utilities/backupable/backupable_db.cc | 2 +- 
utilities/backupable/backupable_db_test.cc | 6 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_db_test.cc | 6 +- .../cassandra/cassandra_functional_test.cc | 4 +- utilities/cassandra/format.h | 2 +- utilities/memory/memory_test.cc | 2 +- .../string_append/stringappend_test.cc | 4 +- utilities/options/options_util_test.cc | 2 +- .../persistent_cache/block_cache_tier.cc | 2 +- utilities/persistent_cache/block_cache_tier.h | 2 +- .../block_cache_tier_file_buffer.h | 2 +- utilities/persistent_cache/hash_table_test.cc | 4 +- .../persistent_cache/persistent_cache_test.h | 4 +- .../optimistic_transaction_test.cc | 6 +- .../transactions/pessimistic_transaction.cc | 2 +- .../pessimistic_transaction_db.cc | 2 +- .../transactions/transaction_lock_mgr.cc | 2 +- utilities/transactions/transaction_test.cc | 4 +- utilities/transactions/transaction_test.h | 4 +- .../write_prepared_transaction_test.cc | 6 +- .../transactions/write_prepared_txn_db.cc | 2 +- utilities/ttl/ttl_test.cc | 2 +- .../write_batch_with_index.cc | 2 +- .../write_batch_with_index_test.cc | 6 +- 196 files changed, 397 insertions(+), 413 deletions(-) rename {util => memory}/allocator.h (100%) rename {util => memory}/arena.cc (99%) rename {util => memory}/arena.h (99%) rename {util => memory}/arena_test.cc (99%) rename {util => memory}/concurrent_arena.cc (97%) rename {util => memory}/concurrent_arena.h (99%) rename {util => memory}/jemalloc_nodump_allocator.cc (99%) rename {util => memory}/jemalloc_nodump_allocator.h (100%) rename {util => memory}/memory_allocator.h (100%) rename {util => memory}/memory_usage.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5614c83b44a..c4dc2500fb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,6 +542,9 @@ set(SOURCES file/file_util.cc file/filename.cc file/sst_file_manager_impl.cc + memory/arena.cc + memory/concurrent_arena.cc + memory/jemalloc_nodump_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -610,14 +613,12 @@ set(SOURCES tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/arena.cc util/auto_roll_logger.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc util/compression_context_cache.cc - util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc @@ -625,7 +626,6 @@ set(SOURCES util/file_reader_writer.cc util/filter_policy.cc util/hash.cc - util/jemalloc_nodump_allocator.cc util/log_buffer.cc util/murmurhash.cc util/random.cc @@ -932,6 +932,7 @@ if(WITH_TESTS) env/env_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -955,7 +956,6 @@ if(WITH_TESTS) tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/arena_test.cc util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc diff --git a/Makefile b/Makefile index d41192ab2e0..244b929c418 100644 --- a/Makefile +++ b/Makefile @@ -1127,7 +1127,7 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index 70d6e219413..a59af2fa697 100644 --- a/TARGETS +++ b/TARGETS @@ 
-147,6 +147,9 @@ cpp_library( "file/file_util.cc", "file/filename.cc", "file/sst_file_manager_impl.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -181,8 +184,8 @@ cpp_library( "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", "table/block_based/block_prefix_index.cc", - "table/block_based/data_block_hash_index.cc", "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", @@ -214,7 +217,6 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/arena.cc", "util/auto_roll_logger.cc", "util/bloom.cc", "util/build_version.cc", @@ -222,7 +224,6 @@ cpp_library( "util/compaction_job_stats_impl.cc", "util/comparator.cc", "util/compression_context_cache.cc", - "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", "util/dynamic_bloom.cc", @@ -230,7 +231,6 @@ cpp_library( "util/file_reader_writer.cc", "util/filter_policy.cc", "util/hash.cc", - "util/jemalloc_nodump_allocator.cc", "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", @@ -353,7 +353,7 @@ cpp_library( ROCKS_TESTS = [ [ "arena_test", - "util/arena_test.cc", + "memory/arena_test.cc", "serial", ], [ diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 377ae146876..0cc3d559502 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -16,9 +16,9 @@ #include #include "cache/clock_cache.h" #include "cache/lru_cache.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/string_util.h" -#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/builder.cc b/db/builder.cc index 14160f64c75..86aac02ab74 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -32,9 +32,9 @@ #include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/column_family_test.cc b/db/column_family_test.cc index f5d57c35b78..21b3321bea6 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -20,12 +20,12 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "util/coding.h" #include "test_util/fault_injection_test_env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index b97fd064e70..438fdb7c96f 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -14,9 +14,9 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction.cc b/db/compaction.cc index 00ebd28b087..089dd66848e 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -18,8 +18,8 @@ #include "db/column_family.h" #include "rocksdb/compaction_filter.h" -#include "util/string_util.h" #include "test_util/sync_point.h" 
+#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction.h b/db/compaction.h index e9ded632503..598b08e7c65 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -9,8 +9,8 @@ #pragma once #include "db/version_set.h" +#include "memory/arena.h" #include "options/cf_options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index b0a553136a3..99bb026b5a9 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -9,9 +9,9 @@ #include #include "port/port.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 9e22e161f28..92a6fab8da8 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -53,6 +53,7 @@ #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/log_buffer.h" @@ -61,7 +62,6 @@ #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index daf41386690..35c1100f99b 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -52,15 +52,15 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 4608cceeac1..93e55b7a03b 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -25,10 +25,10 @@ #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index c01f2884d4c..b25f6cb0890 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -22,10 +22,10 @@ #include "db/column_family.h" #include "file/filename.h" #include "monitoring/statistics.h" +#include "test_util/sync_point.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 82fc16f4f5a..dd33009eb12 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -12,10 +12,10 @@ #include "db/compaction_picker_fifo.h" #include "db/compaction_picker_universal.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include 
"test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index b8d23795fbc..20edd30748d 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -22,10 +22,10 @@ #include "db/column_family.h" #include "file/filename.h" #include "monitoring/statistics.h" +#include "test_util/sync_point.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index ba7042049cb..de55c706ab7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -9,11 +9,11 @@ #include "memtable/stl_wrappers.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 130821ff997..53c4d42d28a 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -27,10 +27,9 @@ #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "file/filename.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index f9efbc58503..6f60e2d7037 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -11,9 +11,9 @@ #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 623836454db..3051e89cd37 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/utilities/convenience.h" -#include "util/concurrent_task_limiter_impl.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "util/concurrent_task_limiter_impl.h" namespace rocksdb { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a1a1c8f99d6..59757aeb9f7 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -22,8 +22,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/db/db_impl.cc b/db/db_impl.cc index ec162bb961e..5534c225f4d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -84,6 +84,7 @@ #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" #include "util/auto_roll_logger.h" #include "util/autovector.h" @@ -97,7 +98,6 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); diff --git 
a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index c5cc0736665..c6025a8cc57 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -21,8 +21,8 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "util/concurrent_task_limiter_impl.h" #include "test_util/sync_point.h" +#include "util/concurrent_task_limiter_impl.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index db47d141655..5dae140c7ea 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -19,8 +19,8 @@ #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based/block_based_table_factory.h" -#include "util/rate_limiter.h" #include "test_util/sync_point.h" +#include "util/rate_limiter.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 8fc17e1446e..d953d365e0f 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -17,6 +17,7 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "file/filename.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -24,7 +25,6 @@ #include "rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/db/db_iter.h b/db/db_iter.h index 8d8af3fd292..85b546c544c 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -13,10 +13,10 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" +#include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index 8c3588e9abd..b864ac4eae1 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -8,9 +8,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 49e670abc28..1503886443b 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 37a9f1a365b..b899ba18b4a 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -19,9 +19,9 @@ #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index bf0babd1a3a..debb2ba603e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -58,14 +58,14 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include 
"test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/file_reader_writer.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_test_util.h b/db/db_test_util.h index 1882cde59dc..2af202fad96 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -46,14 +46,14 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" -#include "util/compression.h" #include "test_util/mock_time_env.h" +#include "util/compression.h" #include "util/mutexlock.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 322381b3867..9eca823c2b7 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -13,8 +13,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index e3f06fe6b65..f4665b06ca3 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,8 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#include "util/logging.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 9c67102c5f0..280d269f1c6 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -20,10 +20,10 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 26cd1127b94..aec398552c7 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -22,9 +22,9 @@ #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 330df7bfe48..00619d447d1 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -22,11 +22,11 @@ #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "test_util/fault_injection_test_env.h" -#include "util/logging.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/db/file_indexer.h b/db/file_indexer.h index 1bef3aab0ca..2091f80292b 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -12,8 +12,8 @@ #include #include #include +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" #include 
"util/autovector.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 754cb3c4651..6942aa682d6 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include #include "db/file_indexer.h" +#include #include "db/dbformat.h" #include "db/version_edit.h" #include "port/stack_trace.h" diff --git a/db/filename_test.cc b/db/filename_test.cc index dabe673d849..377d128fae0 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -11,8 +11,8 @@ #include "db/dbformat.h" #include "port/port.h" -#include "util/logging.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index c8729c66840..d4ae79ff29a 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -45,13 +45,13 @@ #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/event_logger.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index d97ad9f0c2d..ef89199c98e 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -13,10 +13,10 @@ #include "rocksdb/cache.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index f95debec62c..9e0823366d0 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -21,8 +21,8 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" -#include "util/string_util.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 146588d961c..fb73f458edd 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -10,12 +10,12 @@ #include #include +#include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "db/dbformat.h" #include "table/internal_iterator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 9d6851dab16..17b0ca16544 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -34,8 +34,8 @@ int main() { return 0; } #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" +#include "util/gflags_compat.h" const int MAX_SHARDS = 100000; diff --git a/db/listener_test.cc b/db/listener_test.cc index 881534a1f1d..81a0fa17678 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -24,15 +24,14 @@ #include "rocksdb/table_properties.h" #include "table/block_based/block_based_table_factory.h" #include "table/plain/plain_table_factory.h" -#include "file/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" 
#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE diff --git a/db/log_test.cc b/db/log_test.cc index 5b159acf21f..be7a3cbe7cf 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -10,12 +10,12 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/random.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 35e5019ca7e..1a69a89dea0 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -8,12 +8,12 @@ #include #include -#include "rocksdb/db.h" +#include "port/port.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "test_util/testharness.h" -#include "port/port.h" using namespace rocksdb; diff --git a/db/memtable.cc b/db/memtable.cc index 0c706115de0..46acbbfa61a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,6 +19,8 @@ #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/port.h" @@ -31,10 +33,8 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/arena.h" #include "util/autovector.h" #include "util/coding.h" -#include "util/memory_usage.h" #include "util/mutexlock.h" #include "util/util.h" diff --git a/db/memtable.h b/db/memtable.h index 709e2061e5b..6b8c4141f5a 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,13 +19,13 @@ #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" -#include "util/allocator.h" -#include "util/concurrent_arena.h" #include "util/dynamic_bloom.h" #include "util/hash.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index b50b58a1af7..2b4ac6b84da 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -22,9 +22,9 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/log_buffer.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 59da8af1664..f55fbdc501a 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index dc3624af53e..3386f9bd067 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -9,9 
+9,9 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" -#include "util/coding.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index d3dadaa5d30..13c35d2c017 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,6 +7,9 @@ #include #include +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -14,11 +17,8 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/dbformat.h" -#include "db/db_impl.h" -#include "db/write_batch_internal.h" -#include "utilities/merge_operators.h" #include "test_util/testharness.h" +#include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index c6e7d6af07a..655c659b44f 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -20,10 +20,10 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" using std::cerr; using std::cout; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 42d592862c7..94eabff7ff5 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bfeb54243d9..a73dd3cb431 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -29,12 +29,12 @@ #include "table/plain/plain_table_key_coding.h" #include "table/plain/plain_table_reader.h" #include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index e8290e76bca..3f2e794a6c4 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -26,12 +26,12 @@ int main() { #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 54a86169b20..97ba6ca4f8a 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -23,10 +23,10 @@ int main() { #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "test_util/testutil.h" 
#include "util/gflags_compat.h" diff --git a/db/table_cache.cc b/db/table_cache.cc index 4efd3fdf759..14c0169c11a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -22,10 +22,10 @@ #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 0705cc032fe..e818f46142c 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -13,16 +13,16 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" -#include "table/meta_blocks.h" #include "options/cf_options.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" +#include "table/meta_blocks.h" #include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/coding.h" -#include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 5c3bd686b1c..63067857420 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,10 +6,10 @@ #include #include "db/version_edit.h" #include "db/version_set.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 018517a1381..668ff60f103 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -11,10 +11,10 @@ #include "db/version_set.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/event_logger.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/version_edit.h b/db/version_edit.h index ee6499cdc3b..471b4e095ab 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -10,12 +10,12 @@ #pragma once #include #include +#include #include #include -#include -#include "rocksdb/cache.h" #include "db/dbformat.h" -#include "util/arena.h" +#include "memory/arena.h" +#include "rocksdb/cache.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 5f1ae98ba4f..23c63b7caea 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" -#include "util/coding.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/coding.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index 864fc975358..5d0529d2707 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -48,11 +48,11 @@ #include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 41c27fdab65..9b4072dc777 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -10,10 +10,10 @@ #include "db/version_set.h" #include "db/log_writer.h" #include "table/mock_table.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 20b5780c877..2fe5305f8d6 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -28,13 +28,13 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index b1478e26e54..c0c47b0c34b 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -19,10 +19,10 @@ #include "db/wal_manager.h" #include "env/mock_env.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 88c52522917..5de602cee81 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index dbb4759fa03..aa3d077c40d 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -13,12 +13,12 @@ #include "db/db_impl.h" #include "db/write_callback.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" -#include "port/port.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/random.h" using std::string; diff --git a/db/write_thread.cc b/db/write_thread.cc index 872d32ca81b..5ee9439048b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -9,8 +9,8 @@ #include "db/column_family.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" -#include "util/random.h" #include "test_util/sync_point.h" +#include "util/random.h" namespace rocksdb { diff --git a/env/env.cc b/env/env.cc index dcf79fb7fe7..e5e0e99c0a0 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,11 +10,11 
@@ #include "rocksdb/env.h" #include +#include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" #include "port/sys_time.h" #include "rocksdb/options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/env/env_posix.cc b/env/env_posix.cc index 3f75dd6893c..bf1a9e0e5c4 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -49,12 +49,12 @@ #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" #include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" diff --git a/env/env_test.cc b/env/env_test.cc index 852a99c1adc..615eca8b400 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -40,13 +40,13 @@ #include "env/env_chroot.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); diff --git a/env/io_posix.cc b/env/io_posix.cc index 27198b1f975..313cbd8eee6 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -31,9 +31,9 @@ #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 44e3110d5e7..22f28f5375f 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -13,9 +13,9 @@ #include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 122a5d6177e..510753b3b45 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -16,10 +16,10 @@ #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #ifndef ROCKSDB_LITE diff --git a/file/filename.cc b/file/filename.cc index ed19b4109ff..a8fb780054a 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -17,11 +17,11 @@ #include #include #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 9b7278c7d5b..d63170452c0 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -16,8 +16,8 @@ #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git 
a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 9d5de9a2f86..c6b8a92390e 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -22,8 +22,8 @@ #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" /* * Class: org_rocksdb_WriteBatchTest diff --git a/util/allocator.h b/memory/allocator.h similarity index 100% rename from util/allocator.h rename to memory/allocator.h diff --git a/util/arena.cc b/memory/arena.cc similarity index 99% rename from util/arena.cc rename to memory/arena.cc index 67e8a4db782..b774225535e 100644 --- a/util/arena.cc +++ b/memory/arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" +#include "memory/arena.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD #include @@ -21,8 +21,8 @@ #include #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" #include "test_util/sync_point.h" +#include "util/logging.h" namespace rocksdb { diff --git a/util/arena.h b/memory/arena.h similarity index 99% rename from util/arena.h rename to memory/arena.h index dc64154c857..fd97f57e1e5 100644 --- a/util/arena.h +++ b/memory/arena.h @@ -15,12 +15,12 @@ #ifndef OS_WIN #include #endif -#include -#include -#include #include #include -#include "util/allocator.h" +#include +#include +#include +#include "memory/allocator.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/arena_test.cc b/memory/arena_test.cc similarity index 99% rename from util/arena_test.cc rename to memory/arena_test.cc index 052f2a6d5db..18296d307d0 100644 --- a/util/arena_test.cc +++ b/memory/arena_test.cc @@ -7,9 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" -#include "util/random.h" +#include "memory/arena.h" #include "test_util/testharness.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/concurrent_arena.cc b/memory/concurrent_arena.cc similarity index 97% rename from util/concurrent_arena.cc rename to memory/concurrent_arena.cc index cef77d7e75f..722eb3b60bd 100644 --- a/util/concurrent_arena.cc +++ b/memory/concurrent_arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/concurrent_arena.h" +#include "memory/concurrent_arena.h" #include #include "port/port.h" #include "util/random.h" diff --git a/util/concurrent_arena.h b/memory/concurrent_arena.h similarity index 99% rename from util/concurrent_arena.h rename to memory/concurrent_arena.h index a6191100fd0..6b41ab02470 100644 --- a/util/concurrent_arena.h +++ b/memory/concurrent_arena.h @@ -11,9 +11,9 @@ #include #include #include +#include "memory/allocator.h" +#include "memory/arena.h" #include "port/likely.h" -#include "util/allocator.h" -#include "util/arena.h" #include "util/core_local.h" #include "util/mutexlock.h" #include "util/thread_local.h" diff --git a/util/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc similarity index 99% rename from util/jemalloc_nodump_allocator.cc rename to memory/jemalloc_nodump_allocator.cc index cdd08e932e3..1f58351bef6 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/jemalloc_nodump_allocator.h" +#include "memory/jemalloc_nodump_allocator.h" #include #include diff --git a/util/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h similarity index 100% rename from util/jemalloc_nodump_allocator.h rename to memory/jemalloc_nodump_allocator.h diff --git a/util/memory_allocator.h b/memory/memory_allocator.h similarity index 100% rename from util/memory_allocator.h rename to memory/memory_allocator.h diff --git a/util/memory_usage.h b/memory/memory_usage.h similarity index 100% rename from util/memory_usage.h rename to memory/memory_usage.h diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index a1fa4938c52..ddd40aa059f 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
 #include
+#include "memory/allocator.h"
+#include "memory/arena.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "util/allocator.h"
-#include "util/arena.h"

 namespace rocksdb {
diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc
index 878d2338356..e347abe6e69 100644
--- a/memtable/hash_linklist_rep.cc
+++ b/memtable/hash_linklist_rep.cc
@@ -10,13 +10,13 @@
 #include
 #include
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "memtable/skiplist.h"
 #include "monitoring/histogram.h"
 #include "port/port.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "util/arena.h"
 #include "util/hash.h"

 namespace rocksdb {
diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc
index d02919cd4ef..5c74657cd31 100644
--- a/memtable/hash_skiplist_rep.cc
+++ b/memtable/hash_skiplist_rep.cc
@@ -9,14 +9,14 @@

 #include

+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "port/port.h"
 #include "rocksdb/memtablerep.h"
-#include "util/arena.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "port/port.h"
 #include "util/murmurhash.h"
-#include "db/memtable.h"
-#include "memtable/skiplist.h"

 namespace rocksdb {
 namespace {
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index 1ef8f2b6dbc..c3adb2ddbd7 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -46,10 +46,10 @@
 #include
 #include
 #include
+#include "memory/allocator.h"
 #include "port/likely.h"
 #include "port/port.h"
 #include "rocksdb/slice.h"
-#include "util/allocator.h"
 #include "util/coding.h"
 #include "util/random.h"

diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc
index a2f62d5304a..9670f3fc64c 100644
--- a/memtable/inlineskiplist_test.cc
+++ b/memtable/inlineskiplist_test.cc
@@ -10,11 +10,11 @@
 #include "memtable/inlineskiplist.h"
 #include
 #include
+#include "memory/concurrent_arena.h"
 #include "rocksdb/env.h"
-#include "util/concurrent_arena.h"
+#include "test_util/testharness.h"
 #include "util/hash.h"
 #include "util/random.h"
-#include "test_util/testharness.h"

 namespace rocksdb {
diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc
index ae199096563..003d59b2a86 100644
--- a/memtable/memtablerep_bench.cc
+++ b/memtable/memtablerep_bench.cc
@@ -28,6 +28,7 @@ int main() {
 #include "db/dbformat.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/comparator.h"
@@ -35,11 +36,10 @@ int main() {
 #include "rocksdb/options.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "util/arena.h"
+#include "test_util/testutil.h"
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/stop_watch.h"
-#include "test_util/testutil.h"

 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 using GFLAGS_NAMESPACE::RegisterFlagValidator;
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index 47a89034eb9..275daa7940f 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -32,10 +32,10 @@
 #pragma once
 #include
-#include
 #include
+#include
+#include "memory/allocator.h"
 #include "port/port.h"
-#include "util/allocator.h"
 #include "util/random.h"

 namespace rocksdb {
diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc
index 054e3c9df07..33cc19b2d38 100644
--- a/memtable/skiplist_test.cc
+++ b/memtable/skiplist_test.cc
@@ -9,11 +9,11 @@
 #include "memtable/skiplist.h"
 #include
+#include "memory/arena.h"
 #include "rocksdb/env.h"
-#include "util/arena.h"
+#include "test_util/testharness.h"
 #include "util/hash.h"
 #include "util/random.h"
-#include "test_util/testharness.h"

 namespace rocksdb {
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index 32870b127d2..3955217cce7 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -3,10 +3,10 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 //
-#include "memtable/inlineskiplist.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/inlineskiplist.h"
 #include "rocksdb/memtablerep.h"
-#include "util/arena.h"

 namespace rocksdb {
 namespace {
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index 827ab8a5d2b..e7acc94ad67 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -12,8 +12,8 @@
 #include
 #include

-#include "util/arena.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "memtable/stl_wrappers.h"
 #include "port/port.h"
 #include "util/mutexlock.h"
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 0b531a6ec5e..388256abd9f 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -10,7 +10,6 @@
 #include
 #include

-#include "table/plain/plain_table_factory.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/convenience.h"
@@ -23,6 +22,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "table/block_based/block_based_table_factory.h"
+#include "table/plain/plain_table_factory.h"
 #include "util/cast_util.h"
 #include "util/string_util.h"

diff --git a/options/options_parser.cc b/options/options_parser.cc
index 9ae3dfb2785..d5b0c25a32e 100644
--- a/options/options_parser.cc
+++ b/options/options_parser.cc
@@ -16,10 +16,10 @@
 #include "options/options_helper.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
+#include "test_util/sync_point.h"
 #include "util/cast_util.h"
 #include "util/file_reader_writer.h"
 #include "util/string_util.h"
-#include "test_util/sync_point.h"

 #include "port/port.h"
diff --git a/options/options_test.cc b/options/options_test.cc
index 704b2db802b..429b607e4f9 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -27,11 +27,11 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/utilities/leveldb_options.h"
 #include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
 #include "util/random.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
-#include "test_util/testharness.h"
-#include "test_util/testutil.h"
 #include "utilities/merge_operators/bytesxor.h"

 #ifndef GFLAGS
diff --git a/port/win/env_default.cc b/port/win/env_default.cc
index db64878bc02..584a524cf86 100644
--- a/port/win/env_default.cc
+++ b/port/win/env_default.cc
@@ -11,8 +11,8 @@
 #include

 #include "port/win/env_win.h"
-#include "util/compression_context_cache.h"
 #include "test_util/sync_point.h"
+#include "util/compression_context_cache.h"
 #include "util/thread_local.h"

 namespace rocksdb {
diff --git a/port/win/io_win.cc b/port/win/io_win.cc
index 15d1e711412..6fbf6fc6301 100644
--- a/port/win/io_win.cc
+++ b/port/win/io_win.cc
@@ -10,9 +10,9 @@

 #include "port/win/io_win.h"

 #include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
 #include "util/aligned_buffer.h"
 #include "util/coding.h"
-#include "test_util/sync_point.h"

 namespace rocksdb {
 namespace port {
diff --git a/src.mk b/src.mk
index a0f4043bf76..c1ab36b8a61 100644
--- a/src.mk
+++ b/src.mk
@@ -71,6 +71,9 @@ LIB_SOURCES = \
   file/file_util.cc \
   file/filename.cc \
   file/sst_file_manager_impl.cc \
+  memory/arena.cc \
+  memory/concurrent_arena.cc \
+  memory/jemalloc_nodump_allocator.cc \
   memtable/alloc_tracker.cc \
   memtable/hash_linklist_rep.cc \
   memtable/hash_skiplist_rep.cc \
@@ -135,7 +138,6 @@ LIB_SOURCES = \
   test_util/sync_point_impl.cc \
   test_util/transaction_test_util.cc \
   tools/dump/db_dump_tool.cc \
-  util/arena.cc \
   util/auto_roll_logger.cc \
   util/bloom.cc \
   util/build_version.cc \
@@ -143,7 +145,6 @@ LIB_SOURCES = \
   util/compaction_job_stats_impl.cc \
   util/comparator.cc \
   util/compression_context_cache.cc \
-  util/concurrent_arena.cc \
   util/concurrent_task_limiter_impl.cc \
   util/crc32c.cc \
   util/dynamic_bloom.cc \
@@ -151,7 +152,6 @@ LIB_SOURCES = \
   util/file_reader_writer.cc \
   util/filter_policy.cc \
   util/hash.cc \
-  util/jemalloc_nodump_allocator.cc \
   util/log_buffer.cc \
   util/murmurhash.cc \
   util/random.cc \
@@ -339,6 +339,7 @@ MAIN_SOURCES = \
   env/env_basic_test.cc \
   env/env_test.cc \
   env/mock_env_test.cc \
+  memory/arena_test.cc \
   memtable/inlineskiplist_test.cc \
   memtable/memtablerep_bench.cc \
   memtable/skiplist_test.cc \
@@ -367,7 +368,6 @@ MAIN_SOURCES = \
   tools/reduce_levels_test.cc \
   tools/sst_dump_test.cc \
   tools/trace_analyzer_test.cc \
-  util/arena_test.cc \
   util/auto_roll_logger_test.cc \
   util/autovector_test.cc \
   util/bloom_test.cc \
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 8bf6f535612..3c54389b08a 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -22,16 +22,16 @@
 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
-#include "table/format.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/table.h"
 #include "table/block_based/block_prefix_index.h"
 #include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
 #include "table/internal_iterator.h"
-#include "util/random.h"
 #include "test_util/sync_point.h"
+#include "util/random.h"

 namespace rocksdb {
diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc
index 8d074275ce6..e0ca57f1c51 100644
--- a/table/block_based/block_based_filter_block_test.cc
+++ b/table/block_based/block_based_filter_block_test.cc
@@ -10,11 +10,11 @@
 #include "table/block_based/block_based_filter_block.h"

 #include "rocksdb/filter_policy.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
 #include "util/coding.h"
 #include "util/hash.h"
 #include "util/string_util.h"
-#include "test_util/testharness.h"
-#include "test_util/testutil.h"

 namespace rocksdb {
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 034c6b238fd..9769e394f87 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -31,9 +31,9 @@
 #include "rocksdb/table.h"

 #include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_reader.h"
-#include "table/block_based/block_based_filter_block.h"
 #include "table/block_based/block_builder.h"
 #include "table/block_based/filter_block.h"
 #include "table/block_based/full_filter_block.h"
@@ -41,15 +41,14 @@
 #include "table/format.h"
 #include "table/table_builder.h"

+#include "memory/memory_allocator.h"
 #include "util/coding.h"
 #include "util/compression.h"
 #include "util/crc32c.h"
-#include "util/memory_allocator.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
 #include "util/xxhash.h"

-
 namespace rocksdb {

 extern const std::string kHashIndexPrefixesBlock;
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 0c580b445dd..a1ef3889112 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -14,11 +14,11 @@
 #include
 #include

-#include "table/meta_blocks.h"
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/listener.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
+#include "table/meta_blocks.h"
 #include "table/table_builder.h"
 #include "util/compression.h"

diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 609679394ea..121cc916e25 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -23,8 +23,8 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/flush_block_policy.h"
-#include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/format.h"
 #include "util/mutexlock.h"
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 725ecdb4e3f..944a1fde43e 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -15,10 +15,10 @@
 #include
 #include

-#include "table/block_fetcher.h"
-#include "table/meta_blocks.h"
 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
+#include "table/block_fetcher.h"
+#include "table/meta_blocks.h"

 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
@@ -45,12 +45,12 @@
 #include "table/two_level_iterator.h"

 #include "monitoring/perf_context_imp.h"
+#include "test_util/sync_point.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
-#include "test_util/sync_point.h"
 #include "util/xxhash.h"

 namespace rocksdb {
@@ -202,19 +202,18 @@ bool PrefixExtractorChanged(const TableProperties* table_properties,
 // it is owned by the reader or stored in the cache, or whether it is pinned
 // in the cache or not.
 class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
-public:
-  IndexReaderCommon(BlockBasedTable* t,
-                    CachableEntry&& index_block)
-    : table_(t)
-    , index_block_(std::move(index_block))
-  {
+ public:
+  IndexReaderCommon(BlockBasedTable* t, CachableEntry&& index_block)
+      : table_(t), index_block_(std::move(index_block)) {
     assert(table_ != nullptr);
   }

-protected:
+ protected:
   static Status ReadIndexBlock(BlockBasedTable* table,
-      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options,
-      GetContext* get_context, CachableEntry* index_block);
+                               FilePrefetchBuffer* prefetch_buffer,
+                               const ReadOptions& read_options,
+                               GetContext* get_context,
+                               CachableEntry* index_block);

   BlockBasedTable* table() const { return table_; }

@@ -230,7 +229,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
     assert(table_->get_rep() != nullptr);

     const TableProperties* const properties =
-      table_->get_rep()->table_properties.get();
+        table_->get_rep()->table_properties.get();

     return properties == nullptr || !properties->index_key_is_user_key;
   }
@@ -240,7 +239,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
     assert(table_->get_rep() != nullptr);

     const TableProperties* const properties =
-      table_->get_rep()->table_properties.get();
+        table_->get_rep()->table_properties.get();

     return properties == nullptr || !properties->index_value_is_delta_encoded;
   }
@@ -251,20 +250,20 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
   size_t ApproximateIndexBlockMemoryUsage() const {
     assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);

-    return index_block_.GetOwnValue() ?
-      index_block_.GetValue()->ApproximateMemoryUsage() : 0;
+    return index_block_.GetOwnValue()
+               ? index_block_.GetValue()->ApproximateMemoryUsage()
+               : 0;
   }

-private:
+ private:
   BlockBasedTable* table_;
   CachableEntry index_block_;
 };

 Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
-    BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
-    const ReadOptions& read_options, GetContext* get_context,
-    CachableEntry* index_block) {
-
+    BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, GetContext* get_context,
+    CachableEntry* index_block) {
   PERF_TIMER_GUARD(read_index_block_nanos);

   assert(table != nullptr);
@@ -275,27 +274,27 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
   assert(rep != nullptr);

   constexpr bool is_index = true;
-  const Status s = BlockBasedTable::RetrieveBlock(prefetch_buffer,
-    rep, read_options, rep->footer.index_handle(),
-    UncompressionDict::GetEmptyDict(), index_block, is_index, get_context);
+  const Status s = BlockBasedTable::RetrieveBlock(
+      prefetch_buffer, rep, read_options, rep->footer.index_handle(),
+      UncompressionDict::GetEmptyDict(), index_block, is_index, get_context);

   return s;
 }

 Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
-    const ReadOptions& read_options, GetContext* get_context,
-    CachableEntry* index_block) const {
-
+    const ReadOptions& read_options, GetContext* get_context,
+    CachableEntry* index_block) const {
   assert(index_block != nullptr);

   if (!index_block_.IsEmpty()) {
-    *index_block = CachableEntry(index_block_.GetValue(),
-      nullptr /* cache */, nullptr /* cache_handle */, false /* own_value */);
+    *index_block =
+        CachableEntry(index_block_.GetValue(), nullptr /* cache */,
+                      nullptr /* cache_handle */, false /* own_value */);

     return Status::OK();
   }

-  return ReadIndexBlock(table_, nullptr /* prefetch_buffer */,
-    read_options, get_context, index_block);
+  return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, read_options,
+                        get_context, index_block);
 }

 // Index that allows binary search lookup in a two-level index structure.
@@ -335,10 +334,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
   InternalIteratorBase* NewIterator(
       const ReadOptions& read_options, bool /* disable_prefix_seek */,
       IndexBlockIter* iter, GetContext* get_context) override {
-
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -403,7 +401,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
     if (!s.ok()) {
       ROCKS_LOG_WARN(rep->ioptions.info_log,
                      "Error retrieving top-level index block while trying to "
-                     "cache index partitions: %s", s.ToString().c_str());
+                     "cache index partitions: %s",
+                     s.ToString().c_str());
       return;
     }
@@ -474,10 +473,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
   }

  private:
-  PartitionIndexReader(BlockBasedTable* t,
-                       CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-  {}
+  PartitionIndexReader(BlockBasedTable* t, CachableEntry&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}

   std::unordered_map> partition_map_;
 };
@@ -521,8 +518,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
       const ReadOptions& read_options, bool /* disable_prefix_seek */,
       IndexBlockIter* iter, GetContext* get_context) override {
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -558,8 +555,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon
  private:
   BinarySearchIndexReader(BlockBasedTable* t,
                           CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-    {}
+      : IndexReaderCommon(t, std::move(index_block)) {}
 };

 // Index that leverages an internal hash table to quicken the lookup for a given
@@ -620,7 +616,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
     const ImmutableCFOptions& ioptions = rep->ioptions;
     const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
     MemoryAllocator* const memory_allocator =
-      GetMemoryAllocator(rep->table_options);
+        GetMemoryAllocator(rep->table_options);

     // Read contents for the blocks
     BlockContents prefixes_contents;
@@ -661,8 +657,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
       const ReadOptions& read_options, bool disable_prefix_seek,
       IndexBlockIter* iter, GetContext* get_context) override {
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -673,8 +669,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon
     }

     Statistics* kNullStats = nullptr;
-    const bool total_order_seek = read_options.total_order_seek ||
-      disable_prefix_seek;
+    const bool total_order_seek =
+        read_options.total_order_seek || disable_prefix_seek;
     // We don't return pinned data from index blocks, so no need
     // to set `block_contents_pinned`.
     auto it = index_block.GetValue()->NewIterator(
@@ -703,10 +699,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
   }

  private:
-  HashIndexReader(BlockBasedTable* t,
-                  CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-  {}
+  HashIndexReader(BlockBasedTable* t, CachableEntry&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}

   std::unique_ptr prefix_index_;
 };
@@ -1439,7 +1433,6 @@ Status BlockBasedTable::GetDataBlockFromCache(
     const ReadOptions& read_options, CachableEntry* block,
     const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit,
     bool is_index, GetContext* get_context) {
-
   assert(block);
   assert(block->IsEmpty());
@@ -1933,7 +1926,6 @@ BlockBasedTable::GetUncompressionDict(Rep* rep,
 InternalIteratorBase* BlockBasedTable::NewIndexIterator(
     const ReadOptions& read_options, bool disable_prefix_seek,
     IndexBlockIter* input_iter, GetContext* get_context) {
-
   assert(rep_ != nullptr);
   assert(rep_->index_reader != nullptr);
@@ -1963,11 +1955,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   auto uncompression_dict_storage =
-    GetUncompressionDict(rep, prefetch_buffer, no_io, get_context);
+      GetUncompressionDict(rep, prefetch_buffer, no_io, get_context);
   const UncompressionDict& uncompression_dict =
-    uncompression_dict_storage.GetValue() == nullptr
-    ? UncompressionDict::GetEmptyDict()
-    : *uncompression_dict_storage.GetValue();
+      uncompression_dict_storage.GetValue() == nullptr
+          ? UncompressionDict::GetEmptyDict()
+          : *uncompression_dict_storage.GetValue();

   CachableEntry block;
   s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict,
@@ -1988,12 +1980,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   // 2. it's pointing to immortal source. If own_bytes is true then we are
   //    not reading data from the original source, whether immortal or not.
   //    Otherwise, the block is pinned iff the source is immortal.
-  const bool block_contents_pinned = block.IsCached() ||
-    (!block.GetValue()->own_bytes() && rep->immortal_table);
+  const bool block_contents_pinned =
+      block.IsCached() ||
+      (!block.GetValue()->own_bytes() && rep->immortal_table);
   iter = block.GetValue()->NewIterator(
-    &rep->internal_comparator, rep->internal_comparator.user_comparator(),
-    iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
-    index_key_is_full, block_contents_pinned);
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
+      index_key_is_full, block_contents_pinned);

   if (!block.IsCached()) {
     if (!ro.fill_cache && rep->cache_key_prefix_size != 0) {
@@ -2015,7 +2008,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                  next_cache_key_id_++);
       assert(end - cache_key <=
-        static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length));
+             static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length));
       const Slice unique_key(cache_key, static_cast(end - cache_key));
       s = block_cache->Insert(unique_key, nullptr,
                               block.GetValue()->ApproximateMemoryUsage(),
@@ -2066,11 +2059,11 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
                        compressed_cache_key);
   }

-  s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
-                            rep, ro, block_entry, uncompression_dict,
-                            !is_index ?
-                              rep->table_options.read_amp_bytes_per_bit : 0,
-                            is_index, get_context);
+  s = GetDataBlockFromCache(
+      key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry,
+      uncompression_dict,
+      !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, is_index,
+      get_context);

   // Can't find the block from the cache. If I/O is allowed, read from the
   // file.
@@ -2119,7 +2112,6 @@ Status BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry* block_entry, bool is_index, GetContext* get_context) {
-
   assert(rep);
   assert(block_entry);
   assert(block_entry->IsEmpty());
@@ -2127,15 +2119,15 @@ Status BlockBasedTable::RetrieveBlock(
   Status s;
   if (!is_index || rep->table_options.cache_index_and_filter_blocks) {
     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle,
-                                     uncompression_dict, block_entry,
-                                     is_index, get_context);
+                                     uncompression_dict, block_entry, is_index,
+                                     get_context);

     if (!s.ok()) {
       return s;
     }

     if (block_entry->GetValue() != nullptr) {
-      assert (s.ok());
+      assert(s.ok());
       return s;
     }
   }
@@ -2151,16 +2143,14 @@ Status BlockBasedTable::RetrieveBlock(
   {
     StopWatch sw(rep->ioptions.env, rep->ioptions.statistics,
-                READ_BLOCK_GET_MICROS);
-    s = ReadBlockFromFile(rep->file.get(), prefetch_buffer, rep->footer, ro,
-                          handle, &block, rep->ioptions,
-                          rep->blocks_maybe_compressed,
-                          rep->blocks_maybe_compressed, uncompression_dict,
-                          rep->persistent_cache_options,
-                          rep->get_global_seqno(is_index),
-                          !is_index ?
-                            rep->table_options.read_amp_bytes_per_bit : 0,
-                          GetMemoryAllocator(rep->table_options));
+                 READ_BLOCK_GET_MICROS);
+    s = ReadBlockFromFile(
+        rep->file.get(), prefetch_buffer, rep->footer, ro, handle, &block,
+        rep->ioptions, rep->blocks_maybe_compressed,
+        rep->blocks_maybe_compressed, uncompression_dict,
+        rep->persistent_cache_options, rep->get_global_seqno(is_index),
+        !is_index ? rep->table_options.read_amp_bytes_per_bit : 0,
+        GetMemoryAllocator(rep->table_options));
   }

   if (!s.ok()) {
@@ -2756,9 +2746,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       need_upper_bound_check = PrefixExtractorChanged(
           rep_->table_properties.get(), prefix_extractor);
     }
-    auto iiter =
-        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
-                         get_context);
+    auto iiter = NewIndexIterator(read_options, need_upper_bound_check,
+                                  &iiter_on_stack, get_context);
     std::unique_ptr> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
       iiter_unique_ptr.reset(iiter);
@@ -2877,9 +2866,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
       need_upper_bound_check = PrefixExtractorChanged(
           rep_->table_properties.get(), prefix_extractor);
     }
-    auto iiter = NewIndexIterator(
-        read_options, need_upper_bound_check, &iiter_on_stack,
-        sst_file_range.begin()->get_context);
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         sst_file_range.begin()->get_context);
     std::unique_ptr> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
       iiter_unique_ptr.reset(iiter);
@@ -3105,9 +3094,9 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
   }

   char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
-  Slice cache_key = GetCacheKey(rep_->cache_key_prefix,
-                                rep_->cache_key_prefix_size, handle,
-                                cache_key_storage);
+  Slice cache_key =
+      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
+                  cache_key_storage);

   Cache::Handle* const cache_handle = cache->Lookup(cache_key);
   if (cache_handle == nullptr) {
@@ -3187,9 +3176,8 @@ Status BlockBasedTable::CreateIndexReader(
       ROCKS_LOG_WARN(rep_->ioptions.info_log,
                      "Unable to read the metaindex block."
                      " Fall back to binary search index.");
-      return BinarySearchIndexReader::Create(this, prefetch_buffer,
-                                             use_cache, prefetch, pin,
-                                             index_reader);
+      return BinarySearchIndexReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, index_reader);
     }
     meta_index_iter = meta_iter_guard.get();
   }
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 6d265ba755b..3af617fecfa 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -263,12 +263,12 @@ class BlockBasedTable : public TableReader {
   // Similar to the above, with one crucial difference: it will retrieve the
   // block from the file even if there are no caches configured (assuming the
   // read options allow I/O).
- static Status RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, - const ReadOptions& ro, const BlockHandle& handle, - const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, - GetContext* get_context); + static Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, + GetContext* get_context); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -310,8 +310,8 @@ class BlockBasedTable : public TableReader { static Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, - const ReadOptions& read_options, - CachableEntry* block, const UncompressionDict& uncompression_dict, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index = false, GetContext* get_context = nullptr); @@ -351,10 +351,10 @@ class BlockBasedTable : public TableReader { // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. - Status CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* preloaded_meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader); + Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + IndexReader** index_reader); bool FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, diff --git a/table/block_based/block_prefix_index.cc b/table/block_based/block_prefix_index.cc index 0050f1f1e58..6e24f17cf68 100644 --- a/table/block_based/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -7,10 +7,10 @@ #include +#include "memory/arena.h" #include "rocksdb/comparator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index a4c5678881e..2dab4627cb6 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -23,9 +23,9 @@ #include "table/block_based/block.h" #include "table/block_based/block_builder.h" #include "table/format.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 204e92ecbe3..5ec0938714f 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include #include diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 8b01214c7eb..378cdacfff6 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,11 +24,11 @@ #include #include #include "db/dbformat.h" -#include "table/format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/format.h" #include "table/multiget_context.h" #include "util/hash.h" diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 3e5d82733b0..61df028c920 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -11,12 +11,12 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "db/dbformat.h" -#include "util/hash.h" #include "table/block_based/filter_block.h" +#include "util/hash.h" namespace rocksdb { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 8b99f54b03f..82c43b34ed6 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -7,11 +7,11 @@ #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index f11ecd4f4bc..738b9e3e099 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -17,8 +17,8 @@ #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" -#include "table/format.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/format.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 735f1c6e3eb..6860bf82fec 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -15,8 +15,8 @@ #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/full_filter_block.h" #include "table/block_based/cachable_entry.h" +#include "table/block_based/full_filter_block.h" #include "util/autovector.h" namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 2bcafa9771a..9a1a4d526f1 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -3,20 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/full_filter_bits_builder.h" #include "index_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 6c663702900..263abbfcf80 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,6 +12,7 @@ #include #include +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 56b74b50427..6451d6d2acc 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include "memory/memory_allocator.h" #include "table/block_based/block.h" #include "table/format.h" -#include "util/memory_allocator.h" namespace rocksdb { diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 1467e2a8d1b..f9d46c03bd8 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -10,11 +10,11 @@ #include #include -#include "table/meta_blocks.h" #include "table/cuckoo/cuckoo_table_builder.h" -#include "util/file_reader_writer.h" +#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 72885be940e..905528e9bbf 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -15,13 +15,13 @@ #include #include #include +#include "memory/arena.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" -#include "table/internal_iterator.h" -#include "table/meta_blocks.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/get_context.h" -#include "util/arena.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 71e231336c5..681e0dfdf3e 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -22,17 +22,17 @@ int main() { #include #include +#include "memory/arena.h" #include "table/cuckoo/cuckoo_table_builder.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/format.cc b/table/format.cc index 
1adcce6f3f4..3f95fd4d44b 100644 --- a/table/format.cc +++ b/table/format.cc @@ -13,6 +13,7 @@ #include #include "block_fetcher.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/table/format.h b/table/format.h index 84242303ec7..baad78070ca 100644 --- a/table/format.h +++ b/table/format.h @@ -22,11 +22,11 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "memory/memory_allocator.h" #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" #include "util/file_reader_writer.h" -#include "util/memory_allocator.h" namespace rocksdb { diff --git a/table/get_context.h b/table/get_context.h index 8df343b3653..ddce33fb3be 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#include #include +#include #include "db/merge_context.h" #include "db/read_callback.h" #include "rocksdb/env.h" diff --git a/table/iterator.cc b/table/iterator.cc index 0475b9d1342..97a0cef5e08 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/iterator.h" +#include "memory/arena.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 85a2fcc0324..1a0d4df8995 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" @@ -19,11 +20,10 @@ #include "table/internal_iterator.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" +#include "test_util/sync_point.h" #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 9d56c5b9c29..341a1185579 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -16,9 +16,9 @@ #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/mock_table.h b/table/mock_table.h index f99941863a9..42e28266d99 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -12,16 +12,16 @@ #include #include -#include "util/kv_map.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" #include "table/table_reader.h" -#include "util/mutexlock.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/kv_map.h" +#include "util/mutexlock.h" namespace rocksdb { namespace mock { diff --git a/table/plain/plain_table_builder.cc 
b/table/plain/plain_table_builder.cc index 6160d7afd9e..4d50d817643 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -12,18 +12,18 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain/plain_table_factory.h" -#include "db/dbformat.h" #include "table/block_based/block_builder.h" #include "table/bloom_block.h" -#include "table/plain/plain_table_index.h" #include "table/format.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 1457fd00d81..7c8ed1953b0 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -11,10 +11,10 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" #include "rocksdb/options.h" -#include "util/arena.h" #include "util/hash.h" #include "util/murmurhash.h" diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 9c4b614b549..c84f337eb42 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -9,8 +9,8 @@ #include #include #include "db/dbformat.h" -#include "table/plain/plain_table_reader.h" #include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/plain/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h index 26af3f6d8bd..5f65d5a6560 100644 --- a/table/plain/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -8,8 +8,8 @@ #ifndef ROCKSDB_LITE #include -#include "rocksdb/slice.h" #include "db/dbformat.h" +#include "rocksdb/slice.h" #include "table/plain/plain_table_reader.h" // The file contains three helper classes of PlainTable format, diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index b4aad55876b..38852059bf9 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -20,19 +20,19 @@ #include "rocksdb/statistics.h" #include "table/block_based/block.h" -#include "table/bloom_block.h" #include "table/block_based/filter_block.h" +#include "table/bloom_block.h" #include "table/format.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/two_level_iterator.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_key_coding.h" -#include "table/get_context.h" +#include "table/two_level_iterator.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "monitoring/perf_context_imp.h" -#include "util/arena.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index ec6e6a7febb..6c1c12ab8bb 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -13,15 +13,15 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include 
"table/table_reader.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_index.h" -#include "util/arena.h" +#include "table/table_reader.h" #include "util/dynamic_bloom.h" #include "util/file_reader_writer.h" diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 69993492d48..b53f3161e3e 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -11,8 +11,8 @@ #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" -#include "util/file_reader_writer.h" #include "test_util/sync_point.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index f2ae016c10d..822c2294bb7 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,10 +22,10 @@ int main() { #include "table/internal_iterator.h" #include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/table_test.cc b/table/table_test.cc index 372443b536a..c59c9d8c33f 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -46,12 +46,12 @@ #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" -#include "util/compression.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/compression.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index ba883763e9f..7ff73cd4e4f 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -9,11 +9,11 @@ #include "table/two_level_iterator.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/format.h" -#include "util/arena.h" namespace rocksdb { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 12caa2809ad..30aafb66069 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -60,6 +60,8 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "util/cast_util.h" #include "util/compression.h" #include "util/crc32c.h" @@ -68,8 +70,6 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "test_util/testutil.h" -#include "test_util/transaction_test_util.h" #include "util/xxhash.h" #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 52a1f9b91eb..4eb5472acec 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -10,9 +10,9 @@ #include "rocksdb/db_bench_tool.h" #include "options/options_parser.h" #include "rocksdb/utilities/options_util.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" 
+#include "util/random.h" #ifdef GFLAGS #include "util/gflags_compat.h" diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 41ae4c2761e..c7ad71738fa 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -18,8 +18,8 @@ int main() { #include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/types.h" -#include "util/gflags_compat.h" #include "test_util/testutil.h" +#include "util/gflags_compat.h" // Run a thread to perform Put's. // Another thread uses GetUpdatesSince API to keep getting the updates. diff --git a/tools/db_stress.cc b/tools/db_stress.cc index b9ab1a2df11..c112cb348ff 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1977,8 +1977,9 @@ class StressTest { } // Check if the multiget batch crossed the ops_per_open boundary. If it // did, then we should vote to reopen - if (i != 0 && (i % ops_per_open == 0 || - i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { + if (i != 0 && + (i % ops_per_open == 0 || + i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { { thread->stats.FinishedSingleOp(); MutexLock l(thread->shared->GetMutex()); @@ -2173,7 +2174,7 @@ class StressTest { snap_state); } while (!thread->snapshot_queue.empty() && - i >= thread->snapshot_queue.front().first) { + i >= thread->snapshot_queue.front().first) { auto snap_state = thread->snapshot_queue.front().second; assert(snap_state.snapshot); // Note: this is unsafe as the cf might be dropped concurrently. But it @@ -2202,7 +2203,7 @@ class StressTest { // number of ops multiget_batch_size = static_cast( std::min(static_cast(thread->rand.Uniform(64)), - FLAGS_ops_per_thread - i - 1)); + FLAGS_ops_per_thread - i - 1)); // If its the last iteration, ensure that multiget_batch_size is 1 multiget_batch_size = std::max(multiget_batch_size, 1); rand_keys = GenerateNKeys(thread, multiget_batch_size, i); diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index a76416b6c1d..3aa0e3cf36d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -10,10 +10,10 @@ #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/utilities/ldb_cmd.h" -#include "tools/ldb_cmd_impl.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "tools/ldb_cmd_impl.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index ea27f3c8d45..d3b1f0e581d 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -15,9 +15,9 @@ #include "rocksdb/filter_policy.h" #include "table/block_based/block_based_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index 2f31c5d8249..7c242f60f26 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -27,9 +27,9 @@ int main() { #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/trace_reader_writer.h" -#include "tools/trace_analyzer_tool.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "tools/trace_analyzer_tool.h" #include "util/trace_replay.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 5a2049b6405..a5b2139fcaf 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -13,8 +13,8 @@ #include 
"file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 3adbdbb1363..87de5ed5b9f 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -19,9 +19,9 @@ #include #include "port/port.h" #include "rocksdb/db.h" -#include "util/logging.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { namespace { diff --git a/util/autovector_test.cc b/util/autovector_test.cc index edb7af9eaf2..6b1b36e8d18 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -9,10 +9,10 @@ #include #include "rocksdb/env.h" -#include "util/autovector.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/autovector.h" +#include "util/string_util.h" using std::cout; using std::endl; diff --git a/util/bloom.cc b/util/bloom.cc index bedf4a65839..953a42fa213 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -11,8 +11,8 @@ #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/full_filter_bits_builder.h" #include "table/block_based/full_filter_block.h" +#include "table/full_filter_bits_builder.h" #include "util/coding.h" #include "util/hash.h" diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 87cd9da5569..7a13728308c 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -17,13 +17,13 @@ int main() { #include +#include "memory/arena.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" -#include "util/arena.h" -#include "util/gflags_compat.h" -#include "util/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/gflags_compat.h" +#include "util/logging.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/compression.h b/util/compression.h index b901ceb3518..aa8af74499b 100644 --- a/util/compression.h +++ b/util/compression.h @@ -20,11 +20,11 @@ #endif // ROCKSDB_MALLOC_USABLE_SIZE #include +#include "memory/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/memory_allocator.h" #include "util/string_util.h" #ifdef SNAPPY diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 8e90efd89a7..4dfccb0bf36 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -7,9 +7,9 @@ #include +#include "memory/allocator.h" #include "port/port.h" #include "rocksdb/slice.h" -#include "util/allocator.h" #include "util/hash.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index a8a7000f648..036e0128008 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -24,13 +24,13 @@ int main() { #include #include "dynamic_bloom.h" +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/event_logger_test.cc b/util/event_logger_test.cc index 16c6c59f70e..1ee0c4d9787 100644 --- a/util/event_logger_test.cc +++ b/util/event_logger_test.cc @@ -5,8 +5,8 @@ #include 
-#include "util/event_logger.h" #include "test_util/testharness.h" +#include "util/event_logger.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 3003a1ebac0..2c4e0a39f67 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -15,9 +15,9 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/rate_limiter.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 317c1d6c78c..5ec332fc7a1 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -15,8 +15,8 @@ #include "rocksdb/env.h" #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" -#include "util/aligned_buffer.h" #include "test_util/sync_point.h" +#include "util/aligned_buffer.h" namespace rocksdb { diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 18bb65a72bb..a4a9458d642 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -6,9 +6,9 @@ #include "util/file_reader_writer.h" #include #include -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/filelock_test.cc b/util/filelock_test.cc index bd0fc7c4221..3244563d7c3 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -6,10 +6,10 @@ #include "rocksdb/status.h" #include "rocksdb/env.h" -#include #include -#include "util/coding.h" +#include #include "test_util/testharness.h" +#include "util/coding.h" namespace rocksdb { diff --git a/util/hash_test.cc b/util/hash_test.cc index 6618c5a4bc1..8973f926bc3 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -9,8 +9,8 @@ #include -#include "util/hash.h" #include "test_util/testharness.h" +#include "util/hash.h" // The hash algorithm is part of the file format, for example for the Bloom // filters. 
Test that the hash values are stable for a set of random strings of diff --git a/util/log_buffer.h b/util/log_buffer.h index e356b93a746..16fb243117d 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -5,11 +5,11 @@ #pragma once +#include +#include "memory/arena.h" +#include "port/sys_time.h" #include "rocksdb/env.h" -#include "util/arena.h" #include "util/autovector.h" -#include "port/sys_time.h" -#include namespace rocksdb { diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index dd5322151e3..9efa43f8a3c 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -13,10 +13,10 @@ int main() { #include "monitoring/histogram.h" #include "rocksdb/env.h" -#include "util/file_reader_writer.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 93665837fc4..0ee06a121ba 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -11,8 +11,8 @@ #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/aligned_buffer.h" #include "test_util/sync_point.h" +#include "util/aligned_buffer.h" namespace rocksdb { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 3316a75b571..d9f17cc3ac6 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -19,9 +19,9 @@ #include "db/db_test_util.h" #include "rocksdb/env.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/repeatable_thread_test.cc b/util/repeatable_thread_test.cc index 29af340d7cb..8a993e3699e 100644 --- a/util/repeatable_thread_test.cc +++ b/util/repeatable_thread_test.cc @@ -7,9 +7,9 @@ #include #include "db/db_test_util.h" -#include "util/repeatable_thread.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/repeatable_thread.h" class RepeatableThreadTest : public testing::Test { public: diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 787638138c0..9926c391745 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -7,12 +7,12 @@ #include #include -#include "rocksdb/env.h" #include "port/port.h" -#include "util/autovector.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/autovector.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 816c9718b2d..dcd88ffdb8c 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -14,13 +14,13 @@ #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" +#include "test_util/sync_point.h" #include "util/channel.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/checkpoint/checkpoint_impl.h" #ifndef __STDC_FORMAT_MACROS diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index c7377064f82..05006d6a3eb 100644 --- a/utilities/backupable/backupable_db_test.cc +++ 
b/utilities/backupable/backupable_db_test.cc @@ -22,14 +22,14 @@ #include "rocksdb/types.h" #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/options_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/file_reader_writer.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index a93169c30cd..04b7eb73e2b 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -27,6 +27,7 @@ #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_builder.h" #include "table/meta_blocks.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" @@ -34,7 +35,6 @@ #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_iterator.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 19dce3f87d7..19b8b0c727a 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -18,12 +18,12 @@ #include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/utilities/debug.h" -#include "util/cast_util.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/cast_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/blob_db/blob_db.h" #include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_index.h" diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 347846d075c..431ef697929 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -4,16 +4,16 @@ // (found in the LICENSE.Apache file in the root directory). 
#include -#include "rocksdb/db.h" #include "db/db_impl.h" +#include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" #include "util/random.h" -#include "utilities/merge_operators.h" #include "utilities/cassandra/cassandra_compaction_filter.h" #include "utilities/cassandra/merge_operator.h" #include "utilities/cassandra/test_utils.h" +#include "utilities/merge_operators.h" using namespace rocksdb; diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h index 562c1aff3ff..b7f6e32f6ba 100644 --- a/utilities/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -56,8 +56,8 @@ #pragma once #include -#include #include +#include #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "test_util/testharness.h" diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 224f7886bf1..29903d460f2 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -11,9 +11,9 @@ #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/stackable_db.h" #include "table/block_based/block_based_table_factory.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 160bd347bd2..f0b83f621eb 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -12,11 +12,11 @@ #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" +#include "test_util/testharness.h" +#include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/merge_operators/string_append/stringappend2.h" -#include "test_util/testharness.h" -#include "util/random.h" using namespace rocksdb; diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 342db490280..5b8015152ff 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/db.h" #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 775ef29cf8d..5baf64772cc 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -11,9 +11,9 @@ #include #include "port/port.h" +#include "test_util/sync_point.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" #include "utilities/persistent_cache/block_cache_tier_file.h" namespace rocksdb { diff --git a/utilities/persistent_cache/block_cache_tier.h b/utilities/persistent_cache/block_cache_tier.h index 670463a87f9..00dd9a173e9 100644 --- a/utilities/persistent_cache/block_cache_tier.h +++ b/utilities/persistent_cache/block_cache_tier.h @@ -27,10 +27,10 @@ #include "utilities/persistent_cache/block_cache_tier_metadata.h" #include "utilities/persistent_cache/persistent_cache_util.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include 
"monitoring/histogram.h" #include "port/port.h" -#include "util/arena.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/mutexlock.h" diff --git a/utilities/persistent_cache/block_cache_tier_file_buffer.h b/utilities/persistent_cache/block_cache_tier_file_buffer.h index 9d9465c6ca9..e4f8f5ba4b2 100644 --- a/utilities/persistent_cache/block_cache_tier_file_buffer.h +++ b/utilities/persistent_cache/block_cache_tier_file_buffer.h @@ -9,7 +9,7 @@ #include #include "include/rocksdb/comparator.h" -#include "util/arena.h" +#include "memory/arena.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index 51ad211e929..9cc1534973e 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -9,9 +9,9 @@ #include #include "db/db_test_util.h" -#include "util/arena.h" -#include "util/random.h" +#include "memory/arena.h" #include "test_util/testharness.h" +#include "util/random.h" #include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index 6d15d13b69b..29c334442c5 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -19,10 +19,10 @@ #include #include "db/db_test_util.h" +#include "memory/arena.h" +#include "port/port.h" #include "rocksdb/cache.h" #include "table/block_based/block_builder.h" -#include "port/port.h" -#include "util/arena.h" #include "test_util/testharness.h" #include "utilities/persistent_cache/volatile_tier_impl.h" diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index e3105a2139c..4f075d0d9fc 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -9,15 +9,15 @@ #include #include +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" #include "util/crc32c.h" #include "util/logging.h" #include "util/random.h" -#include "test_util/testharness.h" -#include "test_util/transaction_test_util.h" -#include "port/port.h" using std::string; diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index fd9da17aac4..93d75a8357f 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -19,9 +19,9 @@ #include "rocksdb/snapshot.h" #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction_db.h" #include "utilities/transactions/transaction_util.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 95c88594ca9..8920f85fb76 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -20,9 +20,9 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include 
"test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" #include "utilities/transactions/write_prepared_txn_db.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 173e012d88a..757b77fde4e 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -23,9 +23,9 @@ #include "monitoring/perf_context_imp.h" #include "rocksdb/slice.h" #include "rocksdb/utilities/transaction_db_mutex.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/hash.h" -#include "test_util/sync_point.h" #include "util/thread_local.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index d183401f42f..0750b249bbb 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -24,12 +24,12 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 8dfa6b053c5..22dc208f523 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -22,12 +22,12 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 5287cca2038..c0a7e278054 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -28,13 +28,13 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/mutexlock.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc 
index 0508a596e43..b4a71f5ea6c 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -21,10 +21,10 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index c7d8f52aa52..38c6affab8f 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -9,8 +9,8 @@ #include #include "rocksdb/compaction_filter.h" #include "rocksdb/utilities/db_ttl.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" #ifndef OS_WIN #include #endif diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index adec3475cdd..0f8f6c1d622 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -13,11 +13,11 @@ #include "db/db_impl.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include "options/db_options.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" -#include "util/arena.h" #include "util/cast_util.h" #include "util/string_util.h" #include "utilities/write_batch_with_index/write_batch_with_index_internal.h" diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index f8875d9ac1f..3e0a33c3525 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -9,14 +9,14 @@ #ifndef ROCKSDB_LITE -#include +#include "rocksdb/utilities/write_batch_with_index.h" #include +#include #include "db/column_family.h" #include "port/stack_trace.h" -#include "rocksdb/utilities/write_batch_with_index.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h"

From b9f590065872db9b818874ba4bf4402ddd476cc3 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Thu, 30 May 2019 19:29:34 -0700
Subject: [PATCH 089/572] Fix WAL replay by skipping old write batches (#5170)

Summary:
1. Fix a bug in WAL replay in which write batches with old sequence numbers are mistakenly inserted into memtables.
2. Add support for benchmarking a secondary instance in db_bench_tool. With the changes made in this PR, we can start benchmarking a secondary instance using two processes. It is also possible to vary the frequency at which the secondary instance tries to catch up with the primary. The info log of the secondary can be found in a directory whose path can be specified with '-secondary_path'.
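For readers unfamiliar with the secondary-instance feature, a minimal sketch of the two-process setup follows. It uses the public DB::OpenAsSecondary() and DB::TryCatchUpWithPrimary() APIs; the database paths are placeholders (not values used by this patch), and setting max_open_files = -1 mirrors the requirement exercised by db_secondary_test.cc below.

// Minimal sketch of opening a read-only secondary instance against a
// primary's DB directory; paths here are hypothetical.
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // secondary instances require this
  rocksdb::DB* db = nullptr;
  // "name" points at the primary's DB directory; "secondary_path" is a
  // private directory for the secondary's info log, analogous to
  // db_bench's -secondary_path flag mentioned above.
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/tmp/primary_db", "/tmp/secondary_path", &db);
  assert(s.ok());
  // Replays new MANIFEST entries and WAL records from the primary. With
  // this patch, WAL replay skips write batches whose sequence numbers are
  // already covered by SSTs installed during the MANIFEST replay.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  delete db;
  return 0;
}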
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5170 Differential Revision: D15564608 Pulled By: riversand963 fbshipit-source-id: ce97688ed3d33f69d3a0b9266ebbbbf887aa0ec8 --- HISTORY.md | 2 +- db/db_impl_secondary.cc | 52 +++++++++---------- db/db_secondary_test.cc | 49 ++++++++++++++++++ tools/db_bench_tool.cc | 107 +++++++++++++++++++++++++++++++--------- 4 files changed, 159 insertions(+), 51 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 55366b006fc..f645d5cc268 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,7 +22,7 @@ * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. ### Bug Fixes - +* Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 586158ef7ce..a8ea921a260 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -102,7 +102,7 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { // numbers smaller than the smallest log in log_readers_, so there is no // need to pass these logs to RecoverLogFiles uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { + if (!log_readers_.empty()) { log_number_min = log_readers_.begin()->first; } for (size_t i = 0; i < filenames.size(); i++) { @@ -202,11 +202,19 @@ Status DBImplSecondary::RecoverLogFiles( record.size(), Status::Corruption("log record too small")); continue; } + SequenceNumber seq = versions_->LastSequence(); WriteBatchInternal::SetContents(&batch, record); + SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); + // If the write batch's sequence number is smaller than the last sequence + // number of the db, then we should skip this write batch because its + // data must reside in an SST that has already been added in the prior + // MANIFEST replay. + if (seq_of_batch < seq) { + continue; + } std::vector column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); if (status.ok()) { - SequenceNumber seq = versions_->LastSequence(); for (const auto id : column_family_ids) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(id); @@ -235,10 +243,13 @@ Status DBImplSecondary::RecoverLogFiles( cfd->SetMemtable(new_mem); } } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); } - // do not check sequence number because user may toggle disableWAL - // between writes which breaks sequence number continuity guarantee - // If column family was not found, it might mean that the WAL write // batch references to the column family that was dropped after the // insert. 
We don't want to fail the whole write batch in that case -- @@ -246,14 +257,6 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - if (status.ok()) { - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), - nullptr /* flush_scheduler */, true, log_number, this, - false /* concurrent_memtable_writes */, next_sequence, - &has_valid_writes, seq_per_batch_, batch_per_txn_); - } if (status.ok()) { for (const auto id : column_family_ids) { ColumnFamilyData* cfd = @@ -269,31 +272,28 @@ Status DBImplSecondary::RecoverLogFiles( iter->second = log_number; } } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); - continue; } } - if (!status.ok()) { return status; } - - auto last_sequence = *next_sequence - 1; - if ((*next_sequence != kMaxSequenceNumber) && - (versions_->LastSequence() <= last_sequence)) { - versions_->SetLastAllocatedSequence(last_sequence); - versions_->SetLastPublishedSequence(last_sequence); - versions_->SetLastSequence(last_sequence); - } } // remove logreaders from map after successfully recovering the WAL if (log_readers_.size() > 1) { - auto eraseIter = log_readers_.begin(); - std::advance(eraseIter, log_readers_.size() - 1); - log_readers_.erase(log_readers_.begin(), eraseIter); + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); } return status; } diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 50a0923b4c8..23132434f1f 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -526,6 +526,55 @@ TEST_F(DBSecondaryTest, SwitchManifest) { } TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k 
!= 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; const std::string kCFName1 = "pikachu"; Options options; diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 30aafb66069..b98fb42c458 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -752,6 +752,19 @@ DEFINE_uint64(blob_db_bytes_per_sync, 0, "Bytes to sync blob file at."); DEFINE_uint64(blob_db_file_size, 256 * 1024 * 1024, "Target size of each blob file."); +// Secondary DB instance Options +DEFINE_bool(use_secondary_db, false, + "Open a RocksDB secondary instance. A primary instance can be " + "running in another db_bench process."); + +DEFINE_string(secondary_path, "", + "Path to a directory used by the secondary instance to store " + "private files, e.g. info log."); + +DEFINE_int32(secondary_update_interval, 5, + "Secondary instance attempts to catch up with the primary every " + "secondary_update_interval seconds."); + #endif // ROCKSDB_LITE DEFINE_bool(report_bg_io_stats, false, @@ -2571,36 +2584,38 @@ class Benchmark { return base_name + ToString(id); } -void VerifyDBFromDB(std::string& truth_db_name) { - DBWithColumnFamilies truth_db; - auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - ReadOptions ro; - ro.total_order_seek = true; - std::unique_ptr truth_iter(truth_db.db->NewIterator(ro)); - std::unique_ptr db_iter(db_.db->NewIterator(ro)); - // Verify that all the key/values in truth_db are retrivable in db with ::Get - fprintf(stderr, "Verifying db >= truth_db with ::Get...\n"); - for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) { + void VerifyDBFromDB(std::string& truth_db_name) { + DBWithColumnFamilies truth_db; + auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr truth_iter(truth_db.db->NewIterator(ro)); + std::unique_ptr db_iter(db_.db->NewIterator(ro)); + // Verify that all the key/values in truth_db are retrivable in db with + // ::Get + fprintf(stderr, "Verifying db >= truth_db with ::Get...\n"); + for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) { std::string value; s = db_.db->Get(ro, truth_iter->key(), &value); assert(s.ok()); // TODO(myabandeh): provide debugging hints assert(Slice(value) == truth_iter->value()); + } + // Verify that the db iterator does not give any extra key/value + fprintf(stderr, "Verifying db == truth_db...\n"); + for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); + db_iter->Next(), truth_iter->Next()) { + assert(truth_iter->Valid()); + assert(truth_iter->value() == db_iter->value()); + } + // No more key should be left unchecked in truth_db + assert(!truth_iter->Valid()); + fprintf(stderr, "...Verified\n"); } - // Verify that the db iterator does not give any extra key/value - fprintf(stderr, "Verifying db == truth_db...\n"); - for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next(), truth_iter->Next()) { - assert(truth_iter->Valid()); - assert(truth_iter->value() == db_iter->value()); - } - // No more key should be left unchecked in 
truth_db - assert(!truth_iter->Valid()); - fprintf(stderr, "...Verified\n"); -} void Run() { if (!SanityCheck()) {
@@ -2934,6 +2949,12 @@ void VerifyDBFromDB(std::string& truth_db_name) { } } + if (secondary_update_thread_) { + secondary_update_stopped_.store(1, std::memory_order_relaxed); + secondary_update_thread_->join(); + secondary_update_thread_.reset(); + } + #ifndef ROCKSDB_LITE if (name != "replay" && FLAGS_trace_file != "") { Status s = db_.db->EndTrace();
@@ -2953,10 +2974,17 @@ void VerifyDBFromDB(std::string& truth_db_name) { ->ToString() .c_str()); } + if (FLAGS_use_secondary_db) { + fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", + secondary_db_updates_); + } } private: std::shared_ptr timestamp_emulator_; + std::unique_ptr secondary_update_thread_; + std::atomic secondary_update_stopped_{0}; + uint64_t secondary_db_updates_ = 0; struct ThreadArg { Benchmark* bm;
@@ -3618,6 +3646,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { fprintf(stderr, "Cannot use readonly flag with transaction_db\n"); exit(1); } + if (FLAGS_use_secondary_db && + (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) { + fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n"); + exit(1); + } #endif // ROCKSDB_LITE }
@@ -3845,6 +3878,32 @@ void VerifyDBFromDB(std::string& truth_db_name) { if (s.ok()) { db->db = ptr; } + } else if (FLAGS_use_secondary_db) { + if (FLAGS_secondary_path.empty()) { + std::string default_secondary_path; + FLAGS_env->GetTestDirectory(&default_secondary_path); + default_secondary_path += "/dbbench_secondary"; + FLAGS_secondary_path = default_secondary_path; + } + s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db); + if (s.ok() && FLAGS_secondary_update_interval > 0) { + secondary_update_thread_.reset(new port::Thread( + [this](int interval, DBWithColumnFamilies* _db) { + while (0 == secondary_update_stopped_.load( + std::memory_order_relaxed)) { + Status secondary_update_status = + _db->db->TryCatchUpWithPrimary(); + if (!secondary_update_status.ok()) { + fprintf(stderr, "Failed to catch up with primary: %s\n", + secondary_update_status.ToString().c_str()); + break; + } + ++secondary_db_updates_; + FLAGS_env->SleepForMicroseconds(interval * 1000000); + } + }, + FLAGS_secondary_update_interval, db)); + } #endif // ROCKSDB_LITE } else { s = DB::Open(options, db_name, &db->db);
From ff9d286877dd3ec74fc829cf57935bfb479a2182 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 30 May 2019 21:29:44 -0700 Subject: [PATCH 090/572] Reorder DBImpl's private section (#5385)
Summary: The methods and fields in the private section of DBImpl were all intermingled, making it hard to figure out where the fields/methods start and where they end. I cleaned up the code a little so that all the type declarations are at the beginning, followed by methods, and all the data fields are at the end.
This follows Pull Request resolved: https://github.com/facebook/rocksdb/pull/5385 Differential Revision: D15566978 Pulled By: sagar0 fbshipit-source-id: 4618a7d819ad4e2d7cc9ae1af2c59f400140bb1b --- db/db_impl.h | 376 ++++++++++++++++++++++++++------------------------- 1 file changed, 189 insertions(+), 187 deletions(-) diff --git a/db/db_impl.h b/db/db_impl.h index f2544e85941..4c418d6f38f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -339,7 +339,7 @@ class DBImpl : public DB { TablePropertiesCollection* props) override; #endif // ROCKSDB_LITE - + // ---- End of implementations of the DB interface ---- // Function that Get and KeyMayExist call with no_io true or false @@ -372,14 +372,13 @@ class DBImpl : public DB { // depends also on data written to the WAL but not to the memtable. SequenceNumber TEST_GetLastVisibleSequence() const; -#ifndef ROCKSDB_LITE +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback); - // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into the current // memtables. It can then be assumed that any write with a larger(or equal) @@ -811,7 +810,7 @@ class DBImpl : public DB { size_t TEST_EstiamteStatsHistorySize() const; #endif // NDEBUG - + protected: Env* const env_; const std::string dbname_; @@ -1007,7 +1006,10 @@ class DBImpl : public DB { friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; struct WriteContext { SuperVersionContext superversion_context; @@ -1024,8 +1026,138 @@ class DBImpl : public DB { } }; - struct PrepickedCompaction; - struct PurgeFileInfo; + // Class to maintain directories for all database paths other than main one. + class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id) const; + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + return s; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. 
+ log::Writer* writer; // own + // true for some prefix of logs_ + bool getting_synced = false; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_queue_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done; + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. + Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; + }; + + struct CompactionArg { + // caller retains ownership of `db`. + DBImpl* db; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; + }; Status ResumeImpl(); @@ -1079,34 +1211,6 @@ class DBImpl : public DB { SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri); - // Argument required by background flush thread. - struct BGFlushArg { - BGFlushArg() - : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} - BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, - SuperVersionContext* superversion_context) - : cfd_(cfd), - max_memtable_id_(max_memtable_id), - superversion_context_(superversion_context) {} - - // Column family to flush. - ColumnFamilyData* cfd_; - // Maximum ID of memtable to flush. 
In this column family, memtables with - // IDs smaller than this value must be flushed before this flush completes. - uint64_t max_memtable_id_; - // Pointer to a SuperVersionContext object. After flush completes, RocksDB - // installs a new superversion for the column family. This operation - // requires a SuperVersionContext object (currently embedded in JobContext). - SuperVersionContext* superversion_context_; - }; - - // Argument passed to flush thread. - struct FlushThreadArg { - DBImpl* db_; - - Env::Priority thread_pri_; - }; - // Flush the memtables of (multiple) column families to multiple files on // persistent storage. Status FlushMemTablesToOutputFiles( @@ -1345,6 +1449,57 @@ class DBImpl : public DB { void WaitForBackgroundWork(); + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::list::iterator* pending_output_elem, + uint64_t* next_file_number); +#endif //! ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1390,37 +1545,7 @@ class DBImpl : public DB { // expesnive mutex_ lock during WAL write, which update log_empty_. 
bool log_empty_; - struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) : number(_number) {} - void AddSize(uint64_t new_size) { size += new_size; } - uint64_t number; - uint64_t size = 0; - bool getting_flushed = false; - }; - struct LogWriterNumber { - // pass ownership of _writer - LogWriterNumber(uint64_t _number, log::Writer* _writer) - : number(_number), writer(_writer) {} - log::Writer* ReleaseWriter() { - auto* w = writer; - writer = nullptr; - return w; - } - Status ClearWriter() { - Status s = writer->WriteBuffer(); - delete writer; - writer = nullptr; - return s; - } - - uint64_t number; - // Visual Studio doesn't support deque's member to be noncopyable because - // of a std::unique_ptr as a member. - log::Writer* writer; // own - // true for some prefix of logs_ - bool getting_synced = false; - }; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_. However since back() is never popped, and push_back() // is done only from write_thread_, the same thread can access the item @@ -1467,30 +1592,6 @@ class DBImpl : public DB { bool stats_slice_initialized_ = false; - // Class to maintain directories for all database paths other than main one. - class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; - }; - Directories directories_; WriteBufferManager* write_buffer_manager_; @@ -1526,19 +1627,6 @@ class DBImpl : public DB { // State is protected with db mutex. std::list pending_outputs_; - // PurgeFileInfo is a structure to hold information of files to be deleted in - // purge_queue_ - struct PurgeFileInfo { - std::string fname; - std::string dir_to_sync; - FileType type; - uint64_t number; - int job_id; - PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, - int jid) - : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} - }; - // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -1595,42 +1683,8 @@ class DBImpl : public DB { // number of background obsolete file purge jobs, submitted to the HIGH pool int bg_purge_scheduled_; - // Information for a manual compaction - struct ManualCompactionState { - ColumnFamilyData* cfd; - int input_level; - int output_level; - uint32_t output_path_id; - Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress - }; - struct PrepickedCompaction { - // background compaction takes ownership of `compaction`. 
- Compaction* compaction; - // caller retains ownership of `manual_compaction_state` as it is reused - // across background compactions. - ManualCompactionState* manual_compaction_state; // nullptr if non-manual - // task limiter token is requested during compaction picking. - std::unique_ptr task_token; - }; std::deque manual_compaction_dequeue_; - struct CompactionArg { - // caller retains ownership of `db`. - DBImpl* db; - // background compaction takes ownership of `prepicked_compaction`. - PrepickedCompaction* prepicked_compaction; - }; - // shall we disable deletion of obsolete files // if 0 the deletion is enabled. // if non-zero, files will not be getting deleted @@ -1726,58 +1780,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked std::unique_ptr thread_persist_stats_; - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - // Background threads call this function, which is just a wrapper around - // the InstallSuperVersion() function. Background threads carry - // sv_context which can have new_superversion already - // allocated. - // All ColumnFamily state changes go through this function. Here we analyze - // the new state and we schedule background work if we detect that the new - // state needs flush or compaction. - void InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); - - - bool GetIntPropertyInternal(ColumnFamilyData* cfd, - const DBPropertyInfo& property_info, - bool is_locked, uint64_t* value); - bool GetPropertyHandleOptionsStatistics(std::string* value); - - bool HasPendingManualCompaction(); - bool HasExclusiveManualCompaction(); - void AddManualCompaction(ManualCompactionState* m); - void RemoveManualCompaction(ManualCompactionState* m); - bool ShouldntRunManualCompaction(ManualCompactionState* m); - bool HaveManualCompaction(ColumnFamilyData* cfd); - bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); -#ifndef ROCKSDB_LITE - void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, - const Status& st, - const CompactionJobStats& compaction_job_stats, - const int job_id, const Version* current, - CompactionJobInfo* compaction_job_info) const; - // Reserve the next 'num' file numbers for to-be-ingested external SST files, - // and return the current file_number in 'next_file_number'. - // Write a version edit to the MANIFEST. - Status ReserveFileNumbersBeforeIngestion( - ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, - uint64_t* next_file_number); -#endif //! ROCKSDB_LITE - - bool ShouldPurge(uint64_t file_number) const; - void MarkAsGrabbedForPurge(uint64_t file_number); - - size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); - // When set, we use a separate queue for writes that dont write to memtable. // In 2PC these are the writes at Prepare phase. 
const bool two_write_queues_; From ab8f6c01a6c48fd7b8c752a3ef0ef8640065dd48 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 30 May 2019 21:30:41 -0700 Subject: [PATCH 091/572] move LevelCompactionPicker to a separate file (#5369) Summary: In order to improve code readability, this PR moves LevelCompactionBuilder and LevelCompactionPicker to compaction_picker_level.h and .cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/5369 Differential Revision: D15540172 Pulled By: miasantreble fbshipit-source-id: c1a578b93f127cd63661b53f32b356e6edd349af --- CMakeLists.txt | 3 +- TARGETS | 1 + db/column_family.cc | 1 + db/compaction_picker.cc | 534 -------------------------------- db/compaction_picker.h | 17 -- db/compaction_picker_level.cc | 558 ++++++++++++++++++++++++++++++++++ db/compaction_picker_level.h | 32 ++ db/compaction_picker_test.cc | 1 + src.mk | 1 + 9 files changed, 596 insertions(+), 552 deletions(-) create mode 100644 db/compaction_picker_level.cc create mode 100644 db/compaction_picker_level.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c4dc2500fb5..3ddea95deaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -485,6 +485,7 @@ set(SOURCES db/compaction_job.cc db/compaction_picker.cc db/compaction_picker_fifo.cc + db/compaction_picker_level.cc db/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc @@ -945,7 +946,7 @@ if(WITH_TESTS) table/block_based/block_test.cc table/block_based/data_block_hash_index_test.cc table/block_based/full_filter_block_test.cc - table/block_based/partitioned_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc table/cuckoo/cuckoo_table_builder_test.cc table/cuckoo/cuckoo_table_reader_test.cc diff --git a/TARGETS b/TARGETS index a59af2fa697..dc39f87bcef 100644 --- a/TARGETS +++ b/TARGETS @@ -88,6 +88,7 @@ cpp_library( "db/compaction_job.cc", "db/compaction_picker.cc", "db/compaction_picker_fifo.cc", + "db/compaction_picker_level.cc", "db/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", diff --git a/db/column_family.cc b/db/column_family.cc index 84f521cd7b8..fde1996aeaf 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -21,6 +21,7 @@ #include "db/compaction_picker.h" #include "db/compaction_picker_fifo.h" +#include "db/compaction_picker_level.h" #include "db/compaction_picker_universal.h" #include "db/db_impl.h" #include "db/internal_stats.h" diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index b25f6cb0890..bfe13828b18 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -1111,538 +1111,4 @@ bool CompactionPicker::GetOverlappingL0Files( return true; } -bool LevelCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage) const { - if (!vstorage->ExpiredTtlFiles().empty()) { - return true; - } - if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { - return true; - } - if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { - return true; - } - if (!vstorage->FilesMarkedForCompaction().empty()) { - return true; - } - for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { - if (vstorage->CompactionScore(i) >= 1) { - return true; - } - } - return false; -} - -namespace { -// A class to build a leveled compaction step-by-step. 
-class LevelCompactionBuilder { - public: - LevelCompactionBuilder(const std::string& cf_name, - VersionStorageInfo* vstorage, - CompactionPicker* compaction_picker, - LogBuffer* log_buffer, - const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) - : cf_name_(cf_name), - vstorage_(vstorage), - compaction_picker_(compaction_picker), - log_buffer_(log_buffer), - mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} - - // Pick and return a compaction. - Compaction* PickCompaction(); - - // Pick the initial files to compact to the next level. (or together - // in Intra-L0 compactions) - void SetupInitialFiles(); - - // If the initial files are from L0 level, pick other L0 - // files if needed. - bool SetupOtherL0FilesIfNeeded(); - - // Based on initial files, setup other files need to be compacted - // in this compaction, accordingly. - bool SetupOtherInputsIfNeeded(); - - Compaction* GetCompaction(); - - // For the specfied level, pick a file that we want to compact. - // Returns false if there is no file to compact. - // If it returns true, inputs->files.size() will be exactly one. - // If level is 0 and there is already a compaction on that level, this - // function will return false. - bool PickFileToCompact(); - - // For L0->L0, picks the longest span of files that aren't currently - // undergoing compaction for which work-per-deleted-file decreases. The span - // always starts from the newest L0 file. - // - // Intra-L0 compaction is independent of all other files, so it can be - // performed even when L0->base_level compactions are blocked. - // - // Returns true if `inputs` is populated with a span of files to be compacted; - // otherwise, returns false. - bool PickIntraL0Compaction(); - - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); - - const std::string& cf_name_; - VersionStorageInfo* vstorage_; - CompactionPicker* compaction_picker_; - LogBuffer* log_buffer_; - int start_level_ = -1; - int output_level_ = -1; - int parent_index_ = -1; - int base_index_ = -1; - double start_level_score_ = 0; - bool is_manual_ = false; - CompactionInputFiles start_level_inputs_; - std::vector compaction_inputs_; - CompactionInputFiles output_level_inputs_; - std::vector grandparents_; - CompactionReason compaction_reason_ = CompactionReason::kUnknown; - - const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; - // Pick a path ID to place a newly generated file, with its level - static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - int level); - - static const int kMinFilesForIntraL0Compaction = 4; -}; - -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || - (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::SetupInitialFiles() { - // Find the compactions by size on all levels. - bool skipped_l0_to_base = false; - for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { - start_level_score_ = vstorage_->CompactionScore(i); - start_level_ = vstorage_->CompactionScoreLevel(i); - assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); - if (start_level_score_ >= 1) { - if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { - // If L0->base_level compaction is pending, don't schedule further - // compaction from base level. Otherwise L0->base_level compaction - // may starve. - continue; - } - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { - // found the compaction! - if (start_level_ == 0) { - // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - } else { - // L1+ score = `Level files size` / `MaxBytesForLevel` - compaction_reason_ = CompactionReason::kLevelMaxLevelSize; - } - break; - } else { - // didn't find the compaction, clear the inputs - start_level_inputs_.clear(); - if (start_level_ == 0) { - skipped_l0_to_base = true; - // L0->base_level may be blocked due to ongoing L0->base_level - // compactions. It may also be blocked by an ongoing compaction from - // base_level downwards. - // - // In these cases, to reduce L0 file count and thus reduce likelihood - // of write stalls, we can attempt compacting a span of files within - // L0. 
- if (PickIntraL0Compaction()) { - output_level_ = 0; - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - break; - } - } - } - } - } - - // if we didn't find a compaction, check if there are any files marked for - // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; - - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } - } - - // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } - } - - // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } - } - - // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } - } -} - -bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { - if (start_level_ == 0 && output_level_ != 0) { - return compaction_picker_->GetOverlappingL0Files( - vstorage_, &start_level_inputs_, output_level_, &parent_index_); - } - return true; -} - -bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { - // Setup input files from output level. For output to L0, we only compact - // spans of files that do not interact with any pending compactions, so don't - // need to consider other levels. - if (output_level_ != 0) { - output_level_inputs_.level = output_level_; - if (!compaction_picker_->SetupOtherInputs( - cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, - &output_level_inputs_, &parent_index_, base_index_)) { - return false; - } - - compaction_inputs_.push_back(start_level_inputs_); - if (!output_level_inputs_.empty()) { - compaction_inputs_.push_back(output_level_inputs_); - } - - // In some edge cases we could pick a compaction that will be compacting - // a key range that overlap with another running compaction, and both - // of them have the same output level. This could happen if - // (1) we are running a non-exclusive manual compaction - // (2) AddFile ingest a new file into the LSM tree - // We need to disallow this from happening. - if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, - output_level_)) { - // This compaction output could potentially conflict with the output - // of a currently running compaction, we cannot run it. 
- return false; - } - compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, - output_level_inputs_, &grandparents_); - } else { - compaction_inputs_.push_back(start_level_inputs_); - } - return true; -} - -Compaction* LevelCompactionBuilder::PickCompaction() { - // Pick up the first file to start compaction. It may have been extended - // to a clean cut. - SetupInitialFiles(); - if (start_level_inputs_.empty()) { - return nullptr; - } - assert(start_level_ >= 0 && output_level_ >= 0); - - // If it is a L0 -> base level compaction, we need to set up other L0 - // files if needed. - if (!SetupOtherL0FilesIfNeeded()) { - return nullptr; - } - - // Pick files in the output level and expand more files in the start level - // if needed. - if (!SetupOtherInputsIfNeeded()) { - return nullptr; - } - - // Form a compaction object containing the files we picked. - Compaction* c = GetCompaction(); - - TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); - - return c; -} - -Compaction* LevelCompactionBuilder::GetCompaction() { - auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, - MaxFileSizeForLevel(mutable_cf_options_, output_level_, - ioptions_.compaction_style, vstorage_->base_level(), - ioptions_.level_compaction_dynamic_level_bytes), - mutable_cf_options_.max_compaction_bytes, - GetPathId(ioptions_, mutable_cf_options_, output_level_), - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, - output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), - /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, - start_level_score_, false /* deletion_compaction */, compaction_reason_); - - // If it's level 0 compaction, make sure we don't execute any other level 0 - // compactions in parallel - compaction_picker_->RegisterCompaction(c); - - // Creating a compaction influences the compaction score because the score - // takes running compactions into account (by skipping files that are already - // being compacted). Since we just changed compaction score, we recalculate it - // here - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - return c; -} - -/* - * Find the optimal path to place a file - * Given a level, finds the path where levels up to it will fit in levels - * up to and including this path - */ -uint32_t LevelCompactionBuilder::GetPathId( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, int level) { - uint32_t p = 0; - assert(!ioptions.cf_paths.empty()); - - // size remaining in the most recent path - uint64_t current_path_size = ioptions.cf_paths[0].target_size; - - uint64_t level_size; - int cur_level = 0; - - // max_bytes_for_level_base denotes L1 size. - // We estimate L0 size to be the same as L1. - level_size = mutable_cf_options.max_bytes_for_level_base; - - // Last path is the fallback - while (p < ioptions.cf_paths.size() - 1) { - if (level_size <= current_path_size) { - if (cur_level == level) { - // Does desired level fit in this path? - return p; - } else { - current_path_size -= level_size; - if (cur_level > 0) { - if (ioptions.level_compaction_dynamic_level_bytes) { - // Currently, level_compaction_dynamic_level_bytes is ignored when - // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. 
- // Still, adding this check to avoid accidentally using - // max_bytes_for_level_multiplier_additional - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier); - } else { - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier * - mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); - } - } - cur_level++; - continue; - } - } - p++; - current_path_size = ioptions.cf_paths[p].target_size; - } - return p; -} - -bool LevelCompactionBuilder::PickFileToCompact() { - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); - return false; - } - - start_level_inputs_.clear(); - - assert(start_level_ >= 0); - - // Pick the largest file in this level that is not already - // being compacted - const std::vector& file_size = - vstorage_->FilesByCompactionPri(start_level_); - const std::vector& level_files = - vstorage_->LevelFiles(start_level_); - - unsigned int cmp_idx; - for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); - cmp_idx < file_size.size(); cmp_idx++) { - int index = file_size[cmp_idx]; - auto* f = level_files[index]; - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - start_level_inputs_.files.push_back(f); - start_level_inputs_.level = start_level_; - if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_) || - compaction_picker_->FilesRangeOverlapWithCompaction( - {start_level_inputs_}, output_level_)) { - // A locked (pending compaction) input-level file was pulled in due to - // user-key overlap. - start_level_inputs_.clear(); - continue; - } - - // Now that input level is fully expanded, we check whether any output files - // are locked due to pending compaction. - // - // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- - // level files are locked, not just the extra ones pulled in for user-key - // overlap. - InternalKey smallest, largest; - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - CompactionInputFiles output_level_inputs; - output_level_inputs.level = output_level_; - vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, - &output_level_inputs.files); - if (!output_level_inputs.empty() && - !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &output_level_inputs)) { - start_level_inputs_.clear(); - continue; - } - base_index_ = index; - break; - } - - // store where to start the iteration in the next call to PickCompaction - vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); - - return start_level_inputs_.size() > 0; -} - -bool LevelCompactionBuilder::PickIntraL0Compaction() { - start_level_inputs_.clear(); - const std::vector& level_files = - vstorage_->LevelFiles(0 /* level */); - if (level_files.size() < - static_cast( - mutable_cf_options_.level0_file_num_compaction_trigger + 2) || - level_files[0]->being_compacted) { - // If L0 isn't accumulating much files beyond the regular trigger, don't - // resort to L0->L0 compaction yet. 
- return false; - } - return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, - mutable_cf_options_.max_compaction_bytes, - &start_level_inputs_); -} -} // namespace - -Compaction* LevelCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, - mutable_cf_options, ioptions_); - return builder.PickCompaction(); -} - } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 05895a26753..437c8d30473 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -236,23 +236,6 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; -// Picking compactions for leveled compaction. See wiki page -// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction -// for description of Leveled compaction. -class LevelCompactionPicker : public CompactionPicker { - public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; -}; - #ifndef ROCKSDB_LITE // A dummy compaction that never triggers any automatic // compaction. diff --git a/db/compaction_picker_level.cc b/db/compaction_picker_level.cc new file mode 100644 index 00000000000..70fe46c5b81 --- /dev/null +++ b/db/compaction_picker_level.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction_picker_level.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "util/log_buffer.h" + +namespace rocksdb { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. 
+class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& ioptions) + : cf_name_(cf_name), + vstorage_(vstorage), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // For the specfied level, pick a file that we want to compact. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. + bool PickIntraL0Compaction(); + + void PickExpiredTtlFiles(); + + void PickFilesMarkedForPeriodicCompaction(); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + CompactionInputFiles start_level_inputs_; + std::vector compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableCFOptions& ioptions_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { + if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + output_level_ = start_level_ = level_file.first; + + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. + bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! + if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. 
+ if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. 
+ // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::PickFileToCompact() { + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compaction at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); + return false; + } + + start_level_inputs_.clear(); + + assert(start_level_ >= 0); + + // Pick the largest file in this level that is not already + // being compacted + const std::vector<int>& file_size = + vstorage_->FilesByCompactionPri(start_level_); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(start_level_); + + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; + auto* f = level_files[index]; + + // do not pick a file to compact if it is being compacted + // from the n-1 level. + if (f->being_compacted) { + continue; + } + + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. + start_level_inputs_.clear(); + continue; + } + + // Now that the input level is fully expanded, we check whether any output + // files are locked due to pending compaction. + // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + continue; + } + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating many files beyond the regular trigger, don't + // resort to L0->L0 compaction yet.
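+ // Worked example (assumed setting): with
+ // level0_file_num_compaction_trigger = 4, intra-L0 compaction is only
+ // considered once L0 holds at least 4 + 2 = 6 files and the oldest of
+ // them is not already being compacted.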
+ return false; + } + return FindIntraL0Compaction( + level_files, kMinFilesForIntraL0Compaction, port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, &start_level_inputs_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, + mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace rocksdb diff --git a/db/compaction_picker_level.h b/db/compaction_picker_level.h new file mode 100644 index 00000000000..1d37fe50eaf --- /dev/null +++ b/db/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction_picker.h" + +namespace rocksdb { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index dd33009eb12..c3e9e450ff0 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -10,6 +10,7 @@ #include #include "db/compaction.h" #include "db/compaction_picker_fifo.h" +#include "db/compaction_picker_level.h" #include "db/compaction_picker_universal.h" #include "test_util/testharness.h" diff --git a/src.mk b/src.mk index c1ab36b8a61..44013bc2e1d 100644 --- a/src.mk +++ b/src.mk @@ -12,6 +12,7 @@ LIB_SOURCES = \ db/compaction_job.cc \ db/compaction_picker.cc \ db/compaction_picker_fifo.cc \ + db/compaction_picker_level.cc \ db/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ From 83f7a8eed0592cfe275ca5247069adb0acdf75d3 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 31 May 2019 08:24:05 -0700 Subject: [PATCH 092/572] Fix compilation error in LITE mode (#5391) Summary: Add macro ROCKSDB_LITE to fix compilation. 
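The fix follows the usual LITE-build pattern: code that touches non-LITE features is fenced off with the ROCKSDB_LITE macro. A minimal sketch of the pattern (the function name below is hypothetical, not from this patch):

#ifndef ROCKSDB_LITE
  // Compiled only in full builds; LITE builds drop the feature entirely.
  ReportSecondaryDbStats();
#endif  // ROCKSDB_LITE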
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5391 Differential Revision: D15574522 Pulled By: riversand963 fbshipit-source-id: 95aea83c5d9b2bf98a3ba0ef9167b63c9be2988b --- tools/db_bench_tool.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b98fb42c458..d80502f16fa 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2974,10 +2974,13 @@ class Benchmark { ->ToString() .c_str()); } + +#ifndef ROCKSDB_LITE + if (FLAGS_use_secondary_db) { fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", secondary_db_updates_); } +#endif // ROCKSDB_LITE } private: From 0834bbd0b108b7ddc66f963be657d6719515a687 Mon Sep 17 00:00:00 2001 From: qinzuoyan Date: Fri, 31 May 2019 10:40:39 -0700 Subject: [PATCH 093/572] Configure ccache in CMakeLists.txt to speed up compilation Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5388 Differential Revision: D15579052 Pulled By: siying fbshipit-source-id: ee58770fe023f40b9aa189a225e4c7ef50613ea9 --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ddea95deaf..9a4d9deb1b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,12 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) From cb094e13bbadb4031ecab95e084418da60973312 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 31 May 2019 10:45:20 -0700 Subject: [PATCH 094/572] Auto roll logger to enforce options.keep_log_file_num immediately after a new file is created (#5370) Summary: Right now, with the auto roll logger, options.keep_log_file_num enforcement is only triggered by events such as a DB reopen or a full obsolete-file scan. In the meantime, the size and number of log files can grow without limit. This change adds a stronger enforcement of the option, so that the number of log files is always kept under control. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5370 Differential Revision: D15570413 Pulled By: siying fbshipit-source-id: 0916c3c4d42ab8fdd29389ee7fd7e1557b03176e --- HISTORY.md | 1 + file/filename.cc | 32 ++++ file/filename.h | 8 + util/auto_roll_logger.cc | 103 +++++++++++- util/auto_roll_logger.h | 33 ++-- util/auto_roll_logger_test.cc | 194 ++++++++++++++++++++--- utilities/convenience/info_log_finder.cc | 29 +--- 7 files changed, 328 insertions(+), 72 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f645d5cc268..b9b6998c6f5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Partitions of partitioned indexes no longer affect the read amplification statistics. * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either options.max_log_file_size or options.log_file_time_to_roll is set.
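A minimal sketch of the options this entry refers to (values are illustrative, not recommendations):

rocksdb::Options options;
options.max_log_file_size = 1024 * 1024;  // roll the info LOG at ~1 MB
options.log_file_time_to_roll = 3600;     // or roll it hourly
options.keep_log_file_num = 5;            // with this change, also trimmed at each roll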
### New Features * Add an option `snap_refresh_nanos` (defaults to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/file/filename.cc b/file/filename.cc index a8fb780054a..6f00d15ebca 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -407,4 +407,36 @@ Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, return file->Sync(db_options->use_fsync); } +Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, std::string* parent_dir, + std::vector<std::string>* info_log_list) { + assert(parent_dir != nullptr); + assert(info_log_list != nullptr); + uint64_t number = 0; + FileType type; + + if (!db_log_dir.empty()) { + *parent_dir = db_log_dir; + } else { + *parent_dir = dbname; + } + + InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); + + std::vector<std::string> file_names; + Status s = env->GetChildren(*parent_dir, &file_names); + + if (!s.ok()) { + return s; + } + + for (auto& f : file_names) { + if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && + (type == kInfoLogFile)) { + info_log_list->push_back(f); + } + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/file/filename.h b/file/filename.h index eea6b1b02fd..db06f4664e2 100644 --- a/file/filename.h +++ b/file/filename.h @@ -169,4 +169,12 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname); extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, WritableFileWriter* file); +// Returns the list of info log file names in `file_names`. +// The list contains only file names; the parent directory name is stored +// in `parent_dir`. +// `db_log_dir` should be the same as options.db_log_dir +extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, + std::string* parent_dir, + std::vector<std::string>* file_names); } // namespace rocksdb diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index ae6061aed43..9e8d6750319 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -4,12 +4,53 @@ // (found in the LICENSE.Apache file in the root directory).
// #include "util/auto_roll_logger.h" +#include <algorithm> +#include "file/filename.h" +#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { #ifndef ROCKSDB_LITE // -- AutoRollLogger + +AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + kKeepLogFileNum(keep_log_file_num), + cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + if (s.IsNotSupported()) { + db_absolute_path_ = dbname; + } else { + status_ = s; + } + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + if (env_->FileExists(log_fname_).ok()) { + RollLogFile(); + } + GetExistingFiles(); + ResetLogger(); + s = TrimOldLogFiles(); + if (!status_.ok()) { + status_ = s; + } +} + Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); @@ -44,6 +85,58 @@ void AutoRollLogger::RollLogFile() { now++; } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); + old_log_files_.push(old_fname); +} + +void AutoRollLogger::GetExistingFiles() { + { + // Empty the queue to avoid duplicated entries in the queue. + std::queue<std::string> empty; + std::swap(old_log_files_, empty); + } + + std::string parent_dir; + std::vector<std::string> info_log_files; + Status s = + GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + if (status_.ok()) { + status_ = s; + } + // We need to sort the files before enqueuing them so that when we + // delete from the front, it is the oldest file. + std::sort(info_log_files.begin(), info_log_files.end()); + + for (const std::string& f : info_log_files) { + old_log_files_.push(parent_dir + "/" + f); + } +} + +Status AutoRollLogger::TrimOldLogFiles() { + // Here we directly list info files and delete them through Env. + // The deletion isn't going through DB, so there are shortcomings: + // 1. the deletion is not rate limited by SstFileManager + // 2. there is a chance that an I/O will be issued here + // Since it's going to be complicated to pass the DB object down to + // here, we take a simple approach to keep the code easier to + // maintain. + + // old_log_files_.empty() is helpful for the corner case that + // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but + // it's essentially the same thing, and checking empty before accessing + // the queue feels safer. + while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { + Status s = env_->DeleteFile(old_log_files_.front()); + // Remove the file from the tracking anyway. It's possible that + // DB cleaned up the old log file, or people cleaned it up manually. + old_log_files_.pop(); + // To make the file really go away, we should sync the parent directory. + // Since there isn't any consistency issue involved here, skipping + // this part to avoid one I/O here.
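+ // Worked example (assumed numbers): with kKeepLogFileNum == 5 and five
+ // tracked old files right after a roll, this loop deletes one file from
+ // the front of the queue (the oldest), leaving four old files plus the
+ // live LOG file -- five files in total.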
+ if (!s.ok()) { + return s; + } + } + return Status::OK(); } std::string AutoRollLogger::ValistToString(const char* format, @@ -78,12 +171,19 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { RollLogFile(); Status s = ResetLogger(); + Status s2 = TrimOldLogFiles(); + if (!s.ok()) { // can't really log the error if creating a new LOG file failed return; } WriteHeaderInfo(); + + if (!s2.ok()) { + ROCKS_LOG_WARN(logger.get(), "Fail to trim old info log file: %s", + s2.ToString().c_str()); + } } // pin down the current logger_ instance before releasing the mutex. @@ -153,7 +253,8 @@ Status CreateLoggerFromOptions(const std::string& dbname, if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.info_log_level); + options.log_file_time_to_roll, options.keep_log_file_num, + options.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { delete result; diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index a5b2139fcaf..a14fbfd5892 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -8,6 +8,7 @@ #pragma once #include <list> +#include <queue> #include <string> #include "file/filename.h" @@ -24,25 +25,8 @@ class AutoRollLogger : public Logger { public: AutoRollLogger(Env* env, const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, - const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : Logger(log_level), - dbname_(dbname), - db_log_dir_(db_log_dir), - env_(env), - status_(Status::OK()), - kMaxLogFileSize(log_max_size), - kLogFileTimeToRoll(log_file_time_to_roll), - cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), - ctime_(cached_now), - cached_now_access_count(0), - call_NowMicros_every_N_records_(100), - mutex_() { - env->GetAbsolutePath(dbname, &db_absolute_path_); - log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - RollLogFile(); - ResetLogger(); - } + size_t log_file_time_to_roll, size_t keep_log_file_num, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; void Logv(const char* format, va_list ap) override; @@ -110,6 +94,11 @@ class AutoRollLogger : public Logger { bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Read all names of old log files into old_log_files_. + // If there is any error, put the error code in status_. + void GetExistingFiles(); + // Delete old log files if their number exceeds the limit. + Status TrimOldLogFiles(); // Log message to logger without rolling void LogInternal(const char* format, ...); // Serialize the va_list to a string @@ -126,8 +115,14 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + const size_t kKeepLogFileNum; // header information std::list<std::string> headers_; + // List of all existing info log files. Used for enforcing number of + // info log files. + // Full path is stored here. It consumes significantly more memory + // than storing only the file name. Can optimize if it causes a problem.
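+ // Rough illustration (assumed sizes): even 1,000 tracked files with
+ // ~100-byte full paths cost only ~100 KB, which is why storing full
+ // paths is acceptable until proven otherwise.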
+ std::queue<std::string> old_log_files_; + // to avoid frequent env->NowMicros() calls, we cache the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 87de5ed5b9f..ff47719d490 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -41,6 +41,21 @@ class NoSleepEnv : public EnvWrapper { }; } // namespace +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + class AutoRollLoggerTest : public testing::Test { public: static void InitTestDb() { @@ -62,6 +77,41 @@ class AutoRollLoggerTest : public testing::Test { const std::string& log_message); void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, const std::string& log_message); + // return list of files under kTestDir that contain "LOG" + std::vector<std::string> GetLogFiles() { + std::vector<std::string> ret; + std::vector<std::string> files; + Status s = default_env->GetChildren(kTestDir, &files); + // Should call ASSERT_OK() here but it doesn't compile. It's not + // worth the time figuring out why. + EXPECT_TRUE(s.ok()); + for (const auto& f : files) { + if (f.find("LOG") != std::string::npos) { + ret.push_back(f); + } + } + return ret; + } + + // Delete all log files under kTestDir + void CleanupLogFiles() { + for (const std::string& f : GetLogFiles()) { + ASSERT_OK(default_env->DeleteFile(kTestDir + "/" + f)); + } + } + + void RollNTimesBySize(Logger* auto_roll_logger, size_t file_num, + size_t max_log_file_size) { + // Roll the log file_num + 1 times; the logger's trimming then enforces + // its keep_log_file_num limit. + std::string dummy_large_string; + dummy_large_string.assign(max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < file_num + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + } static const std::string kSampleMessage; static const std::string kTestDir; @@ -77,21 +127,6 @@ const std::string AutoRollLoggerTest::kLogFile( test::PerThreadDBPath("db_log_test") + "/LOG"); Env* AutoRollLoggerTest::default_env = Env::Default(); -// In this test we only want to Log some simple log message with -// no format. LogMessage() provides such a simple interface and -// avoids the [format-security] warning which occurs when you -// call ROCKS_LOG_INFO(logger, log_message) directly.
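// Illustrative background (not part of the patch): passing a runtime string
// directly as the format, as in ROCKS_LOG_INFO(logger, log_message), is what
// -Wformat-security rejects; routing it through a literal "%s" format, as the
// helpers above do, keeps the format string constant and the call safe.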
-namespace { -void LogMessage(Logger* logger, const char* message) { - ROCKS_LOG_INFO(logger, "%s", message); -} - -void LogMessage(const InfoLogLevel log_level, Logger* logger, - const char* message) { - Log(log_level, logger, "%s", message); -} -} // namespace - void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message) { @@ -159,8 +194,10 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, TEST_F(AutoRollLoggerTest, RollLogFileBySize) { InitTestDb(); size_t log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, + keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); @@ -171,11 +208,12 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { size_t time = 2; size_t log_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); // -- Test the existence of the file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); RollLogFileByTimeTest(&nse, &logger, time, @@ -192,28 +230,30 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { // treated as "signed". size_t kZero = 0; size_t log_size = 1024; + size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", + log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. - logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { size_t time = 2, log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, + keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, @@ -269,6 +309,107 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - both"); RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); + + // Set keep_log_file_num + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll.
+ LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector<std::string> files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + + CleanupLogFiles(); + } + + // Set keep_log_file_num and dbname is different from + // db_log_dir. + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + options.db_log_dir = kTestDir; + ASSERT_OK(CreateLoggerFromOptions("/dummy/db/name", options, &logger)); + auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector<std::string> files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + for (const auto& f : files) { + ASSERT_TRUE(f.find("dummy") != std::string::npos); + } + + // Clean up those files. + CleanupLogFiles(); + } +} + +TEST_F(AutoRollLoggerTest, AutoDeleting) { + for (int attempt = 0; attempt < 2; attempt++) { + // In the first attempt, db_log_dir is not set, while in the + // second it is set. + std::string dbname = (attempt == 0) ? kTestDir : "/test/dummy/dir"; + std::string db_log_dir = (attempt == 0) ? "" : kTestDir; + + InitTestDb(); + const size_t kMaxFileSize = 512; + { + size_t log_num = 8; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + RollNTimesBySize(&logger, log_num, kMaxFileSize); + + ASSERT_EQ(log_num, GetLogFiles().size()); + } + // Shrink number of files + { + size_t log_num = 5; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(log_num, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + // Increase number of files again. + { + size_t log_num = 7; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(6, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + CleanupLogFiles(); + } } TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { @@ -322,7 +463,7 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra scope to force the AutoRollLogger to flush the log file when it // becomes out of scope.
{ - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -360,7 +501,7 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -446,8 +587,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + /*keep_log_file_num=*/10); if (test_num == 0) { // Log some headers explicitly using Header() diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc index 3e599961630..646362aa2c2 100644 --- a/utilities/convenience/info_log_finder.cc +++ b/utilities/convenience/info_log_finder.cc @@ -14,35 +14,12 @@ namespace rocksdb { Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) { - uint64_t number = 0; - FileType type; - std::string path; - if (!db) { return Status::InvalidArgument("DB pointer is not valid"); } - + std::string parent_path; const Options& options = db->GetOptions(); - if (!options.db_log_dir.empty()) { - path = options.db_log_dir; - } else { - path = db->GetName(); - } - InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), db->GetName()); - auto* env = options.env; - std::vector<std::string> file_names; - Status s = env->GetChildren(path, &file_names); - - if (!s.ok()) { - return s; - } - - for (auto f : file_names) { - if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && - (type == kInfoLogFile)) { - info_log_list->push_back(f); - } - } - return Status::OK(); + return GetInfoLogFiles(options.env, options.db_log_dir, db->GetName(), + &parent_path, info_log_list); } } // namespace rocksdb From a3609b7dde4b8a37602c74d5cf08a502a067198e Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 31 May 2019 11:37:21 -0700 Subject: [PATCH 095/572] Improve const correctness in BlockBasedTableReader (#5383) Summary: Many methods are passing around pointers to non-const objects when in fact they do not/should not modify said objects. The patch makes the semantics clearer and also helps from a thread safety point-of-view by changing some pointers to pointers-to-const and marking some instance methods as const. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5383 Differential Revision: D15562770 Pulled By: ltamasi fbshipit-source-id: 89361dadbb8b25bbe54d17e8da28fee24a2419af --- table/block_based/block_based_table_reader.cc | 36 ++++++++++--------- table/block_based/block_based_table_reader.h | 26 ++++++-------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 944a1fde43e..b7fba779f47 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -203,19 +203,20 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, // in the cache or not.
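// A minimal sketch of the const-correctness pattern this patch applies
// (ExampleTable and ExampleReader are hypothetical stand-ins, not RocksDB
// types): read paths hold a pointer-to-const and are marked const, so the
// compiler itself guarantees that lookups never mutate shared table state.
//
//   class ExampleReader {
//    public:
//     explicit ExampleReader(const ExampleTable* t) : table_(t) {}
//     bool Contains(const Slice& key) const;  // may read, cannot modify
//    private:
//     const ExampleTable* table_;  // read-only view shared across threads
//   };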
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { public: - IndexReaderCommon(BlockBasedTable* t, CachableEntry<Block>&& index_block) + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) : table_(t), index_block_(std::move(index_block)) { assert(table_ != nullptr); } protected: - static Status ReadIndexBlock(BlockBasedTable* table, + static Status ReadIndexBlock(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, CachableEntry<Block>* index_block); - BlockBasedTable* table() const { return table_; } + const BlockBasedTable* table() const { return table_; } const InternalKeyComparator* internal_comparator() const { assert(table_ != nullptr); @@ -256,12 +257,12 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { } private: - BlockBasedTable* table_; + const BlockBasedTable* table_; CachableEntry<Block> index_block_; }; Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( - BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, CachableEntry<Block>* index_block) { PERF_TIMER_GUARD(read_index_block_nanos); @@ -304,7 +305,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // `PartitionIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { assert(table != nullptr); @@ -473,7 +474,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - PartitionIndexReader(BlockBasedTable* t, CachableEntry<Block>&& index_block) + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; @@ -488,7 +490,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { assert(table != nullptr); @@ -553,7 +555,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - BinarySearchIndexReader(BlockBasedTable* t, + BinarySearchIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} }; @@ -562,7 +564,7 @@ // key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon { public: - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { @@ -699,7 +701,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - HashIndexReader(BlockBasedTable* t, CachableEntry<Block>&& index_block) + HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} std::unique_ptr<BlockPrefixIndex> prefix_index_; @@ -1188,7 +1190,7 @@ Status BlockBasedTable::ReadRangeDelBlock( } Status BlockBasedTable::ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr<const BlockContents>* compression_dict_block) { assert(compression_dict_block != nullptr); Status s; @@ -1842,7 +1844,7 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter( } CachableEntry<UncompressionDict> -BlockBasedTable::GetUncompressionDict(Rep* rep, +BlockBasedTable::GetUncompressionDict(const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) { if (!rep->table_options.cache_index_and_filter_blocks) { @@ -1925,7 +1927,7 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, GetContext* get_context) { + IndexBlockIter* input_iter, GetContext* get_context) const { assert(rep_ != nullptr); assert(rep_->index_reader != nullptr); @@ -1941,7 +1943,7 @@ InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator( // If input_iter is not null, update this iter and return it template <typename TBlockIter> TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& handle, + const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { @@ -2164,7 +2166,7 @@ Status BlockBasedTable::RetrieveBlock( } BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( - BlockBasedTable* table, + const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, bool index_key_includes_seq, bool index_key_is_full) : table_(table), @@ -2214,7 +2216,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( bool BlockBasedTable::PrefixMayMatch( const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check) { + const bool need_upper_bound_check) const { if (!rep_->filter_policy) { return true; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3af617fecfa..f6f610ca2ac 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -112,7 +112,7 @@ class BlockBasedTable : public TableReader { bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check); + const bool need_upper_bound_check) const; // Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must @@ -215,18 +215,12 @@ class BlockBasedTable : public TableReader { struct Rep; Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } // input_iter: if it is not null, update this one and return it as Iterator template <typename TBlockIter> static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr); - template <typename TBlockIter> - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, + const Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); @@ -283,7 +277,7 @@ class BlockBasedTable : public TableReader { const SliceTransform* prefix_extractor = nullptr) const; static CachableEntry<UncompressionDict> GetUncompressionDict( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context); // Get the iterator from the index reader. @@ -299,7 +293,7 @@ class BlockBasedTable : public TableReader { InternalIteratorBase<BlockHandle>* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, - GetContext* get_context = nullptr); + GetContext* get_context = nullptr) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -386,7 +380,7 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator); static Status ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr<const BlockContents>* compression_dict_block); static Status PrefetchIndexAndFilterBlocks( Rep* rep, FilePrefetchBuffer* prefetch_buffer, @@ -430,7 +424,7 @@ class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: PartitionedIndexIteratorState( - BlockBasedTable* table, + const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, const bool index_key_includes_seq, const bool index_key_is_full); InternalIteratorBase<BlockHandle>* NewSecondaryIterator( @@ -438,7 +432,7 @@ class BlockBasedTable::PartitionedIndexIteratorState private: // Don't own table_ - BlockBasedTable* table_; + const BlockBasedTable* table_; std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_; bool index_key_includes_seq_; bool index_key_is_full_; @@ -561,7 +555,7 @@ struct BlockBasedTable::Rep { template <class TBlockIter, typename TValue = Slice> class BlockBasedTableIterator : public InternalIteratorBase<TValue> { public: - BlockBasedTableIterator(BlockBasedTable* table, + BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, InternalIteratorBase<BlockHandle>* index_iter, @@ -681,7 +675,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> { void CheckOutOfBound(); private: - BlockBasedTable* table_; + const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; From 49c5a12dbee3aa65907e772b254d753c6d391da1 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 31 May 2019 11:52:59 -0700 Subject: [PATCH 096/572] Organizing rocksdb/db
directory Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5390 Differential Revision: D15579388 Pulled By: vjnadimpalli fbshipit-source-id: 5bfc95e31554b8ff05b97b76d6534113f527f366 --- CMakeLists.txt | 42 +++++++++---------- Makefile | 10 ++--- TARGETS | 42 +++++++++---------- db/builder.cc | 2 +- db/column_family.cc | 10 ++--- db/column_family_test.cc | 2 +- db/compact_files_test.cc | 2 +- db/compacted_db_impl.cc | 2 +- db/compacted_db_impl.h | 2 +- db/{ => compaction}/compaction.cc | 3 +- db/{ => compaction}/compaction.h | 0 .../compaction_iteration_stats.h | 0 db/{ => compaction}/compaction_iterator.cc | 3 +- db/{ => compaction}/compaction_iterator.h | 4 +- .../compaction_iterator_test.cc | 2 +- db/{ => compaction}/compaction_job.cc | 4 +- db/{ => compaction}/compaction_job.h | 2 +- .../compaction_job_stats_test.cc | 2 +- db/{ => compaction}/compaction_job_test.cc | 2 +- db/{ => compaction}/compaction_picker.cc | 2 +- db/{ => compaction}/compaction_picker.h | 2 +- db/{ => compaction}/compaction_picker_fifo.cc | 2 +- db/{ => compaction}/compaction_picker_fifo.h | 2 +- .../compaction_picker_level.cc | 3 +- db/{ => compaction}/compaction_picker_level.h | 2 +- db/{ => compaction}/compaction_picker_test.cc | 9 ++-- .../compaction_picker_universal.cc | 2 +- .../compaction_picker_universal.h | 2 +- db/convenience.cc | 2 +- db/corruption_test.cc | 2 +- db/cuckoo_table_db_test.cc | 2 +- db/db_filesnapshot.cc | 2 +- db/{ => db_impl}/db_impl.cc | 4 +- db/{ => db_impl}/db_impl.h | 4 +- db/{ => db_impl}/db_impl_compaction_flush.cc | 2 +- db/{ => db_impl}/db_impl_debug.cc | 2 +- db/{ => db_impl}/db_impl_experimental.cc | 2 +- db/{ => db_impl}/db_impl_files.cc | 2 +- db/{ => db_impl}/db_impl_open.cc | 2 +- db/{ => db_impl}/db_impl_readonly.cc | 4 +- db/{ => db_impl}/db_impl_readonly.h | 2 +- db/{ => db_impl}/db_impl_secondary.cc | 2 +- db/{ => db_impl}/db_impl_secondary.h | 2 +- db/{ => db_impl}/db_impl_write.cc | 2 +- db/{ => db_impl}/db_secondary_test.cc | 2 +- db/db_iter.h | 2 +- db/db_options_test.cc | 2 +- db/db_test.cc | 2 +- db/db_test_util.h | 2 +- db/deletefile_test.cc | 2 +- db/error_handler.cc | 2 +- db/experimental.cc | 2 +- db/fault_injection_test.cc | 2 +- db/forward_iterator.cc | 2 +- db/in_memory_stats_history.cc | 2 +- db/internal_stats.cc | 2 +- db/listener_test.cc | 2 +- db/memtable_list.cc | 2 +- db/merge_test.cc | 5 ++- db/obsolete_files_test.cc | 2 +- db/options_file_test.cc | 2 +- db/plain_table_db_test.cc | 2 +- db/prefix_test.cc | 2 +- db/range_del_aggregator.cc | 2 +- db/range_del_aggregator.h | 2 +- db/repair.cc | 2 +- db/repair_test.cc | 2 +- db/table_properties_collector_test.cc | 2 +- db/version_set.cc | 2 +- db/version_set.h | 4 +- db/wal_manager_test.cc | 2 +- db/write_batch.cc | 2 +- db/write_callback_test.cc | 2 +- file/sst_file_manager_impl.cc | 2 +- file/sst_file_manager_impl.h | 2 +- src.mk | 42 +++++++++---------- table/table_reader_bench.cc | 2 +- tools/db_bench_tool.cc | 2 +- tools/db_stress.cc | 2 +- tools/ldb_cmd.cc | 3 +- tools/reduce_levels_test.cc | 2 +- tools/trace_analyzer_tool.cc | 2 +- util/trace_replay.cc | 3 +- utilities/backupable/backupable_db_test.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_file.cc | 3 +- .../cassandra/cassandra_functional_test.cc | 2 +- utilities/checkpoint/checkpoint_test.cc | 2 +- utilities/debug.cc | 2 +- utilities/memory/memory_test.cc | 2 +- utilities/memory/memory_util.cc | 2 +- .../transactions/optimistic_transaction.cc | 2 +- 
.../optimistic_transaction_db_impl.cc | 2 +- .../transactions/pessimistic_transaction.cc | 2 +- .../pessimistic_transaction_db.cc | 2 +- utilities/transactions/transaction_base.cc | 2 +- utilities/transactions/transaction_test.cc | 2 +- utilities/transactions/transaction_test.h | 2 +- utilities/transactions/transaction_util.cc | 2 +- .../write_prepared_transaction_test.cc | 2 +- utilities/transactions/write_prepared_txn.cc | 2 +- .../transactions/write_prepared_txn_db.cc | 2 +- .../transactions/write_unprepared_txn.cc | 2 +- utilities/ttl/db_ttl_impl.h | 2 +- .../write_batch_with_index.cc | 2 +- 105 files changed, 186 insertions(+), 184 deletions(-) rename db/{ => compaction}/compaction.cc (99%) rename db/{ => compaction}/compaction.h (100%) rename db/{ => compaction}/compaction_iteration_stats.h (100%) rename db/{ => compaction}/compaction_iterator.cc (99%) rename db/{ => compaction}/compaction_iterator.h (99%) rename db/{ => compaction}/compaction_iterator_test.cc (99%) rename db/{ => compaction}/compaction_job.cc (99%) rename db/{ => compaction}/compaction_job.h (99%) rename db/{ => compaction}/compaction_job_stats_test.cc (99%) rename db/{ => compaction}/compaction_job_test.cc (99%) rename db/{ => compaction}/compaction_picker.cc (99%) rename db/{ => compaction}/compaction_picker.h (99%) rename db/{ => compaction}/compaction_picker_fifo.cc (99%) rename db/{ => compaction}/compaction_picker_fifo.h (98%) rename db/{ => compaction}/compaction_picker_level.cc (99%) rename db/{ => compaction}/compaction_picker_level.h (96%) rename db/{ => compaction}/compaction_picker_test.cc (99%) rename db/{ => compaction}/compaction_picker_universal.cc (99%) rename db/{ => compaction}/compaction_picker_universal.h (98%) rename db/{ => db_impl}/db_impl.cc (99%) rename db/{ => db_impl}/db_impl.h (99%) rename db/{ => db_impl}/db_impl_compaction_flush.cc (99%) rename db/{ => db_impl}/db_impl_debug.cc (99%) rename db/{ => db_impl}/db_impl_experimental.cc (99%) rename db/{ => db_impl}/db_impl_files.cc (99%) rename db/{ => db_impl}/db_impl_open.cc (99%) rename db/{ => db_impl}/db_impl_readonly.cc (99%) rename db/{ => db_impl}/db_impl_readonly.h (99%) rename db/{ => db_impl}/db_impl_secondary.cc (99%) rename db/{ => db_impl}/db_impl_secondary.h (99%) rename db/{ => db_impl}/db_impl_write.cc (99%) rename db/{ => db_impl}/db_secondary_test.cc (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a4d9deb1b6..4c2fa7119c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -486,24 +486,24 @@ set(SOURCES db/c.cc db/column_family.cc db/compacted_db_impl.cc - db/compaction.cc - db/compaction_iterator.cc - db/compaction_job.cc - db/compaction_picker.cc - db/compaction_picker_fifo.cc - db/compaction_picker_level.cc - db/compaction_picker_universal.cc + db/compaction/compaction.cc + db/compaction/compaction_iterator.cc + db/compaction/compaction_picker.cc + db/compaction/compaction_job.cc + db/compaction/compaction_picker_fifo.cc + db/compaction/compaction_picker_level.cc + db/compaction/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc - db/db_impl.cc - db/db_impl_write.cc - db/db_impl_compaction_flush.cc - db/db_impl_files.cc - db/db_impl_open.cc - db/db_impl_debug.cc - db/db_impl_experimental.cc - db/db_impl_readonly.cc - db/db_impl_secondary.cc + db/db_impl/db_impl.cc + db/db_impl/db_impl_write.cc + db/db_impl/db_impl_compaction_flush.cc + db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc + db/db_impl/db_impl_debug.cc + db/db_impl/db_impl_experimental.cc + db/db_impl/db_impl_readonly.cc 
+ db/db_impl/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -868,10 +868,10 @@ if(WITH_TESTS) cache/lru_cache_test.cc db/column_family_test.cc db/compact_files_test.cc - db/compaction_iterator_test.cc - db/compaction_job_stats_test.cc - db/compaction_job_test.cc - db/compaction_picker_test.cc + db/compaction/compaction_job_stats_test.cc + db/compaction/compaction_job_test.cc + db/compaction/compaction_iterator_test.cc + db/compaction/compaction_picker_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -894,7 +894,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_secondary_test.cc + db/db_impl/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc diff --git a/Makefile b/Makefile index 244b929c418..5181154a212 100644 --- a/Makefile +++ b/Makefile @@ -1339,13 +1339,13 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_iterator_test: db/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_stats_test: db/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1417,7 +1417,7 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1585,7 +1585,7 @@ range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- diff --git a/TARGETS b/TARGETS index dc39f87bcef..edddc7b99be 100644 --- a/TARGETS +++ b/TARGETS @@ -83,24 +83,24 @@ cpp_library( "db/c.cc", "db/column_family.cc", "db/compacted_db_impl.cc", - "db/compaction.cc", - "db/compaction_iterator.cc", - "db/compaction_job.cc", - "db/compaction_picker.cc", - "db/compaction_picker_fifo.cc", - "db/compaction_picker_level.cc", - "db/compaction_picker_universal.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", - "db/db_impl.cc", - "db/db_impl_compaction_flush.cc", - "db/db_impl_debug.cc", - "db/db_impl_experimental.cc", - 
"db/db_impl_files.cc", - "db/db_impl_open.cc", - "db/db_impl_readonly.cc", - "db/db_impl_secondary.cc", - "db/db_impl_write.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", "db/db_info_dumper.cc", "db/db_iter.cc", "db/dbformat.cc", @@ -454,22 +454,22 @@ ROCKS_TESTS = [ ], [ "compaction_iterator_test", - "db/compaction_iterator_test.cc", + "db/compaction/compaction_iterator_test.cc", "serial", ], [ "compaction_job_stats_test", - "db/compaction_job_stats_test.cc", + "db/compaction/compaction_job_stats_test.cc", "serial", ], [ "compaction_job_test", - "db/compaction_job_test.cc", + "db/compaction/compaction_job_test.cc", "serial", ], [ "compaction_picker_test", - "db/compaction_picker_test.cc", + "db/compaction/compaction_picker_test.cc", "serial", ], [ @@ -609,7 +609,7 @@ ROCKS_TESTS = [ ], [ "db_secondary_test", - "db/db_secondary_test.cc", + "db/db_impl/db_secondary_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 86aac02ab74..67d764ad18b 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" diff --git a/db/column_family.cc b/db/column_family.cc index fde1996aeaf..ce22a00aac3 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -19,11 +19,11 @@ #include #include -#include "db/compaction_picker.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_level.h" -#include "db/compaction_picker_universal.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" #include "db/range_del_aggregator.h" diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 21b3321bea6..9374a135866 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,8 +12,8 @@ #include #include -#include "db/db_impl.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 438fdb7c96f..92975da87c1 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -10,7 +10,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index acdaad4ec29..88928391ad2 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 5c574b4b9a5..8c1a1428c81 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,7 +5,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include 
#include diff --git a/db/compaction.cc b/db/compaction/compaction.cc similarity index 99% rename from db/compaction.cc rename to db/compaction/compaction.cc index 089dd66848e..5dc7e83c8fc 100644 --- a/db/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction.h" - #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -17,6 +15,7 @@ #include #include "db/column_family.h" +#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" #include "test_util/sync_point.h" #include "util/string_util.h" diff --git a/db/compaction.h b/db/compaction/compaction.h similarity index 100% rename from db/compaction.h rename to db/compaction/compaction.h diff --git a/db/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h similarity index 100% rename from db/compaction_iteration_stats.h rename to db/compaction/compaction_iteration_stats.h diff --git a/db/compaction_iterator.cc b/db/compaction/compaction_iterator.cc similarity index 99% rename from db/compaction_iterator.cc rename to db/compaction/compaction_iterator.cc index 7e060969962..135018f5148 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_iterator.h" - +#include "db/compaction/compaction_iterator.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" diff --git a/db/compaction_iterator.h b/db/compaction/compaction_iterator.h similarity index 99% rename from db/compaction_iterator.h rename to db/compaction/compaction_iterator.h index 6ab43b1becf..9744ab8dfc8 100644 --- a/db/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" +#include "db/compaction/compaction.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc similarity index 99% rename from db/compaction_iterator_test.cc rename to db/compaction/compaction_iterator_test.cc index 99bb026b5a9..ddda79a4cfe 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_iterator.h" #include #include +#include "db/compaction/compaction_iterator.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/compaction_job.cc b/db/compaction/compaction_job.cc similarity index 99% rename from db/compaction_job.cc rename to db/compaction/compaction_job.cc index 92a6fab8da8..3866d70ee00 100644 --- a/db/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_job.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -24,8 +23,9 @@ #include #include +#include "db/compaction/compaction_job.h" #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/error_handler.h" diff --git a/db/compaction_job.h b/db/compaction/compaction_job.h similarity index 99% rename from db/compaction_job.h rename to db/compaction/compaction_job.h index 0751727d704..1387fffb1c1 100644 --- a/db/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -18,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc similarity index 99% rename from db/compaction_job_stats_test.cc rename to db/compaction/compaction_job_stats_test.cc index 35c1100f99b..91310e9f112 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -21,7 +21,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" diff --git a/db/compaction_job_test.cc b/db/compaction/compaction_job_test.cc similarity index 99% rename from db/compaction_job_test.cc rename to db/compaction/compaction_job_test.cc index 93e55b7a03b..838cda5eaca 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -17,7 +17,7 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/error_handler.h" #include "db/version_set.h" #include "rocksdb/cache.h" diff --git a/db/compaction_picker.cc b/db/compaction/compaction_picker.cc similarity index 99% rename from db/compaction_picker.cc rename to db/compaction/compaction_picker.cc index bfe13828b18..4276ea9cb41 100644 --- a/db/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/compaction_picker.h b/db/compaction/compaction_picker.h similarity index 99% rename from db/compaction_picker.h rename to db/compaction/compaction_picker.h index 437c8d30473..53477014cf6 100644 --- a/db/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -15,7 +15,7 @@ #include #include -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/version_set.h" #include "options/cf_options.h" #include "rocksdb/env.h" diff --git a/db/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc similarity index 99% rename from db/compaction_picker_fifo.cc rename to db/compaction/compaction_picker_fifo.cc index eadb31f9ee5..ffb5a9f6495 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE #ifndef __STDC_FORMAT_MACROS diff --git a/db/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h similarity index 98% rename from db/compaction_picker_fifo.h rename to db/compaction/compaction_picker_fifo.h index 9da107c5d4a..a4e63803cf8 100644 --- a/db/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class FIFOCompactionPicker : public CompactionPicker { diff --git a/db/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc similarity index 99% rename from db/compaction_picker_level.cc rename to db/compaction/compaction_picker_level.cc index 70fe46c5b81..aeb368ea20a 100644 --- a/db/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_level.h" - #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -17,6 +15,7 @@ #include #include +#include "db/compaction/compaction_picker_level.h" #include "test_util/sync_point.h" #include "util/log_buffer.h" diff --git a/db/compaction_picker_level.h b/db/compaction/compaction_picker_level.h similarity index 96% rename from db/compaction_picker_level.h rename to db/compaction/compaction_picker_level.h index 1d37fe50eaf..9fc196698a1 100644 --- a/db/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -9,7 +9,7 @@ #pragma once -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { // Picking compactions for leveled compaction. See wiki page diff --git a/db/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc similarity index 99% rename from db/compaction_picker_test.cc rename to db/compaction/compaction_picker_test.cc index c3e9e450ff0..bab93227a4f 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -3,15 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_picker.h" #include #include #include -#include "db/compaction.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_level.h" -#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc similarity index 99% rename from db/compaction_picker_universal.cc rename to db/compaction/compaction_picker_universal.cc index 20edd30748d..465245715fd 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE #ifndef __STDC_FORMAT_MACROS diff --git a/db/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h similarity index 98% rename from db/compaction_picker_universal.h rename to db/compaction/compaction_picker_universal.h index 375e5998e25..2c44735d95f 100644 --- a/db/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class UniversalCompactionPicker : public CompactionPicker { diff --git a/db/convenience.cc b/db/convenience.cc index 71c237f60c0..c11653fb190 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -8,7 +8,7 @@ #include "rocksdb/convenience.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" namespace rocksdb { diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 53c4d42d28a..9e83c9080e6 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -16,7 +16,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" #include "file/filename.h" diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 6f60e2d7037..135a34c2e09 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "table/cuckoo/cuckoo_table_factory.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 59757aeb9f7..ac544793ee4 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -14,7 +14,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" #include "file/file_util.h" diff --git a/db/db_impl.cc b/db/db_impl/db_impl.cc similarity index 99% rename from db/db_impl.cc rename to db/db_impl/db_impl.cc index 5534c225f4d..196e38f14fa 100644 --- a/db/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -28,7 +28,7 @@ #include #include "db/builder.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" diff --git a/db/db_impl.h b/db/db_impl/db_impl.h similarity index 99% rename from db/db_impl.h rename to db/db_impl/db_impl.h index 4c418d6f38f..27d39f90d24 100644 --- a/db/db_impl.h +++ b/db/db_impl/db_impl.h @@ -20,7 +20,7 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" @@ -39,7 +39,7 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "memtable_list.h" +#include "db/memtable_list.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc similarity index 99% rename from db/db_impl_compaction_flush.cc rename to db/db_impl/db_impl_compaction_flush.cc index c6025a8cc57..881fa26af37 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc similarity index 99% rename from db/db_impl_debug.cc rename to db/db_impl/db_impl_debug.cc index f558971190e..4b558facb37 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -9,7 +9,7 @@ #ifndef NDEBUG -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "monitoring/thread_status_updater.h" diff --git a/db/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc similarity index 99% rename from db/db_impl_experimental.cc rename to db/db_impl/db_impl_experimental.cc index 47a880199e2..a8fed40be01 100644 --- a/db/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_files.cc b/db/db_impl/db_impl_files.cc similarity index 99% rename from db/db_impl_files.cc rename to db/db_impl/db_impl_files.cc index 64c6dc96879..608c8ce4948 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_open.cc b/db/db_impl/db_impl_open.cc similarity index 99% rename from db/db_impl_open.cc rename to db/db_impl/db_impl_open.cc index 5dae140c7ea..5019221b5ca 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc similarity index 99% rename from db/db_impl_readonly.cc rename to db/db_impl/db_impl_readonly.cc index 5d7515c28e2..55249228456 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -3,12 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_readonly.h" +#include "db/db_impl/db_impl_readonly.h" #include "db/compacted_db_impl.h" -#include "db/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" +#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { diff --git a/db/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h similarity index 99% rename from db/db_impl_readonly.h rename to db/db_impl/db_impl_readonly.h index 23816210dc8..18df900cba0 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc similarity index 99% rename from db/db_impl_secondary.cc rename to db/db_impl/db_impl_secondary.cc index a8ea921a260..a976a5750dd 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h similarity index 99% rename from db/db_impl_secondary.h rename to db/db_impl/db_impl_secondary.h index a57835432dc..24cfd33c11d 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/db_impl_write.cc b/db/db_impl/db_impl_write.cc similarity index 99% rename from db/db_impl_write.cc rename to db/db_impl/db_impl_write.cc index 98463f7b27f..02e23e26931 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc similarity index 99% rename from db/db_secondary_test.cc rename to db/db_impl/db_secondary_test.cc index 23132434f1f..c9184281c22 100644 --- a/db/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl_secondary.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl_secondary.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" diff --git a/db/db_iter.h b/db/db_iter.h index 85b546c544c..9a6df9610a4 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -10,10 +10,10 @@ #pragma once #include #include -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" +#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" diff --git a/db/db_options_test.cc b/db/db_options_test.cc index b899ba18b4a..36ecf3a1b57 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,8 +11,8 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" diff --git a/db/db_test.cc b/db/db_test.cc index debb2ba603e..4c4bd382ca8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -24,7 +24,7 @@ #endif #include "cache/lru_cache.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/job_context.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index 2af202fad96..4e9fcafadfa 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -25,7 +25,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" #include "file/filename.h" diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 280d269f1c6..18014e5b435 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index 140fb4850f6..1d818f48948 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" -#include "db/db_impl.h" #include "db/event_helpers.h" +#include "db/db_impl/db_impl.h" #include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/experimental.cc b/db/experimental.cc index d509a37bf2e..0c3c3335d92 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -5,7 +5,7 @@ #include "rocksdb/experimental.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { namespace experimental { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 00619d447d1..e6ce1fa8364 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,9 +11,9 @@ // the last "sync". 
It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#include "db/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" +#include "db/db_impl/db_impl.h" #include "env/mock_env.h" #include "file/filename.h" #include "rocksdb/cache.h" diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 9e0823366d0..2633a3ff9bd 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" diff --git a/db/in_memory_stats_history.cc b/db/in_memory_stats_history.cc index e9e0cc74950..41fdb71c8c1 100644 --- a/db/in_memory_stats_history.cc +++ b/db/in_memory_stats_history.cc @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" #include "db/in_memory_stats_history.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 58332f30faf..21dde297ab6 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -22,7 +22,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 81a0fa17678..6fabf197f2c 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 2b4ac6b84da..ca5283139a5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" diff --git a/db/merge_test.cc b/db/merge_test.cc index 13c35d2c017..1b62b5c2c57 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,8 +7,8 @@ #include #include -#include "db/db_impl.h" #include "db/dbformat.h" +#include "db/db_impl/db_impl.h" #include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -18,6 +18,9 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 655c659b44f..3a78869c95d 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/options_file_test.cc b/db/options_file_test.cc index c7eba52c290..b86ecefa97a 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -6,7 +6,7 
@@ #ifndef ROCKSDB_LITE #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index a73dd3cb431..d2d0426e652 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 3f2e794a6c4..19f02f1099a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -17,7 +17,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 8f86528ecb2..7c188aeaa07 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -5,7 +5,7 @@ #include "db/range_del_aggregator.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index ce7897a975a..96cfb581309 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/repair.cc b/db/repair.cc index 577c122bcf9..400e754ba45 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -66,7 +66,7 @@ #include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/log_writer.h" diff --git a/db/repair_test.cc b/db/repair_test.cc index 1851cde0dfc..21907e43575 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "file/file_util.h" #include "rocksdb/comparator.h" diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index e818f46142c..a9895bbedba 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -9,10 +9,10 @@ #include #include -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" diff --git a/db/version_set.cc b/db/version_set.cc index 5d0529d2707..26465a01a4e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -23,7 +23,7 @@ #include #include #include -#include "db/compaction.h" +#include "compaction/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" diff --git a/db/version_set.h b/db/version_set.h index 776e08e448c..c43e4091442 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -29,8 +29,8 @@ #include #include "db/column_family.h" -#include "db/compaction.h" -#include "db/compaction_picker.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/file_indexer.h" #include 
"db/log_reader.h" diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index c0c47b0c34b..3657fb691be 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -13,7 +13,7 @@ #include "rocksdb/write_buffer_manager.h" #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" #include "db/wal_manager.h" diff --git a/db/write_batch.cc b/db/write_batch.cc index 830fbeab15d..1459e5a3aae 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -42,13 +42,13 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" #include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index aa3d077c40d..b5e26a8a7f0 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -11,7 +11,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_callback.h" #include "port/port.h" #include "rocksdb/db.h" diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index d63170452c0..efd9e30e6a5 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index b506ece2796..89304227807 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -11,7 +11,7 @@ #include "port/port.h" -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/error_handler.h" #include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" diff --git a/src.mk b/src.mk index 44013bc2e1d..5021acb96ac 100644 --- a/src.mk +++ b/src.mk @@ -7,24 +7,24 @@ LIB_SOURCES = \ db/c.cc \ db/column_family.cc \ db/compacted_db_impl.cc \ - db/compaction.cc \ - db/compaction_iterator.cc \ - db/compaction_job.cc \ - db/compaction_picker.cc \ - db/compaction_picker_fifo.cc \ - db/compaction_picker_level.cc \ - db/compaction_picker_universal.cc \ + db/compaction/compaction.cc \ + db/compaction/compaction_iterator.cc \ + db/compaction/compaction_job.cc \ + db/compaction/compaction_picker.cc \ + db/compaction/compaction_picker_fifo.cc \ + db/compaction/compaction_picker_level.cc \ + db/compaction/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ - db/db_impl.cc \ - db/db_impl_compaction_flush.cc \ - db/db_impl_debug.cc \ - db/db_impl_experimental.cc \ - db/db_impl_files.cc \ - db/db_impl_open.cc \ - db/db_impl_readonly.cc \ - db/db_impl_secondary.cc \ - db/db_impl_write.cc \ + db/db_impl/db_impl.cc \ + db/db_impl/db_impl_compaction_flush.cc \ + db/db_impl/db_impl_debug.cc \ + db/db_impl/db_impl_experimental.cc \ + db/db_impl/db_impl_files.cc \ + db/db_impl/db_impl_open.cc \ + db/db_impl/db_impl_readonly.cc \ + db/db_impl/db_impl_secondary.cc \ + db/db_impl/db_impl_write.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -259,10 +259,10 @@ MAIN_SOURCES = \ cache/cache_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ - db/compaction_iterator_test.cc \ - db/compaction_job_stats_test.cc \ 
- db/compaction_job_test.cc \ - db/compaction_picker_test.cc \ + db/compaction/compaction_iterator_test.cc \ + db/compaction/compaction_job_test.cc \ + db/compaction/compaction_job_stats_test.cc \ + db/compaction/compaction_picker_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -286,7 +286,7 @@ MAIN_SOURCES = \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_secondary_test.cc \ + db/db_impl/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 822c2294bb7..2ec7b2d0fb5 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -11,7 +11,7 @@ int main() { } #else -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index d80502f16fa..c6f19bed585 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -33,7 +33,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/malloc_stats.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c112cb348ff..0c828deb165 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -44,7 +44,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" #include "monitoring/histogram.h" diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 10e9a495d23..d6f9b415707 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -12,7 +13,7 @@ #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/write_batch_internal.h" diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index 3aa0e3cf36d..8b23dbf369d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/utilities/ldb_cmd.h" diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 03057afbc78..93528c00608 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -27,7 +27,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "options/cf_options.h" diff --git a/util/trace_replay.cc b/util/trace_replay.cc index c90fef2eff8..9e0e8c48cde 100644 --- a/util/trace_replay.cc +++ b/util/trace_replay.cc @@ -8,7 +8,8 @@ #include #include #include -#include "db/db_impl.h" + +#include "db/db_impl/db_impl.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/coding.h" diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 05006d6a3eb..37d9e4cd182 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "env/env_chroot.h" #include "file/filename.h" #include "port/port.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 04b7eb73e2b..7f447a04ad0 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -11,7 +11,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_batch_internal.h" #include "file/file_util.h" #include "file/filename.h" diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index e14307d44cd..4475772d8d1 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -17,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "file/filename.h" #include "util/logging.h" diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 431ef697929..cec9ce7d88f 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index da2972affd7..d7d2548af3e 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -16,7 +16,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" diff --git a/utilities/debug.cc b/utilities/debug.cc index 72fcbf0f54d..8ddf64b5dc4 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -7,7 +7,7 @@ #include "rocksdb/utilities/debug.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 29903d460f2..75fb9cd3f92 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/cache.h" #include "rocksdb/table.h" #include "rocksdb/utilities/memory_util.h" diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc index 83bf33c1794..47ca4b7bb7d 100644 --- a/utilities/memory/memory_util.cc +++ b/utilities/memory/memory_util.cc @@ -7,7 +7,7 @@ #include "rocksdb/utilities/memory_util.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction.cc b/utilities/transactions/optimistic_transaction.cc index 48c9180ae9e..e8cf6eade4e 100644 --- a/utilities/transactions/optimistic_transaction.cc +++ b/utilities/transactions/optimistic_transaction.cc @@ -10,7 +10,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/status.h" diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index d9db6fde07e..b7fedc06615 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -10,7 +10,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 93d75a8357f..ed7444894c7 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -13,7 +13,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/snapshot.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8920f85fb76..e906b444ff5 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -16,7 +16,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc 
index 68b87b5aa47..d4923a88f4c 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -13,7 +13,7 @@ #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/column_family.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 0750b249bbb..6c9f4bccd62 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -16,7 +16,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 22dc208f523..da2a08d3c52 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -15,7 +15,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction.h" diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index ec6f7e60ae2..c582b73aa3e 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -15,7 +15,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/status.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "util/string_util.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index c0a7e278054..8b52b1ae662 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -18,7 +18,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "rocksdb/db.h" #include "rocksdb/options.h" diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 6c7cb359dc4..05650e2b3f9 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -16,7 +16,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index b4a71f5ea6c..bf94d83d82b 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -17,7 +17,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 823b12ea171..efd766514c8 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/transactions/write_unprepared_txn.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" #include "utilities/transactions/write_unprepared_txn_db.h" diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 
8bf064a0466..69e991ed855 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -16,7 +16,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/utility_db.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifdef _WIN32 // Windows API macro interference diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 0f8f6c1d622..cf17abf22e9 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -10,7 +10,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "memory/arena.h" From cae22c53fbad071be8aa3a8543415383b4dfaef4 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 31 May 2019 15:21:36 -0700 Subject: [PATCH 097/572] Make format Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5395 Differential Revision: D15581698 Pulled By: vjnadimpalli fbshipit-source-id: f415972f16e784b1361714c202b97defcab46767 --- db/column_family_test.cc | 2 +- db/compacted_db_impl.h | 4 ++-- db/compaction/compaction_iterator.h | 2 +- db/compaction/compaction_job.cc | 2 +- db/db_impl/db_impl.h | 2 +- db/db_impl/db_impl_readonly.cc | 2 +- db/db_impl/db_secondary_test.cc | 2 +- db/db_iter.h | 2 +- db/db_options_test.cc | 2 +- db/error_handler.cc | 2 +- db/fault_injection_test.cc | 2 +- db/listener_test.cc | 2 +- db/merge_test.cc | 5 +---- db/write_batch.cc | 2 +- utilities/transactions/transaction_base.cc | 2 +- utilities/ttl/db_ttl_impl.h | 6 +++--- 16 files changed, 19 insertions(+), 22 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 9374a135866..63d987f3c99 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,8 +12,8 @@ #include #include -#include "db/db_test_util.h" #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 8c1a1428c81..c1b8da9a782 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,9 +5,9 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl/db_impl.h" -#include #include +#include +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 9744ab8dfc8..86a2b87b22c 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction/compaction_iteration_stats.h" #include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3866d70ee00..b782c6ca7ad 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -23,8 +23,8 @@ #include #include -#include "db/compaction/compaction_job.h" #include "db/builder.h" +#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 27d39f90d24..c241a36dbc3 100644 --- a/db/db_impl/db_impl.h 
+++ b/db/db_impl/db_impl.h @@ -30,6 +30,7 @@ #include "db/internal_stats.h" #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" #include "db/pre_release_callback.h" #include "db/range_del_aggregator.h" #include "db/read_callback.h" @@ -39,7 +40,6 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "db/memtable_list.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 55249228456..6db498397ce 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -6,9 +6,9 @@ #include "db/db_impl/db_impl_readonly.h" #include "db/compacted_db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" -#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c9184281c22..e8eafd673ed 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_test_util.h" #include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" diff --git a/db/db_iter.h b/db/db_iter.h index 9a6df9610a4..6a4bf8a5507 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -10,10 +10,10 @@ #pragma once #include #include +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" -#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 36ecf3a1b57..a9c8d218235 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,8 +11,8 @@ #include #include "db/column_family.h" -#include "db/db_test_util.h" #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index 1d818f48948..9e1bf5cc107 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" -#include "db/event_helpers.h" #include "db/db_impl/db_impl.h" +#include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index e6ce1fa8364..126addc80d1 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,9 +11,9 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
+#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" -#include "db/db_impl/db_impl.h" #include "env/mock_env.h" #include "file/filename.h" #include "rocksdb/cache.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 6fabf197f2c..5d8f6eb5e63 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" diff --git a/db/merge_test.cc b/db/merge_test.cc index 1b62b5c2c57..2965045d9df 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,8 +7,8 @@ #include #include -#include "db/dbformat.h" #include "db/db_impl/db_impl.h" +#include "db/dbformat.h" #include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -18,9 +18,6 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" -#include "db/dbformat.h" -#include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/write_batch.cc b/db/write_batch.cc index 1459e5a3aae..d7a2e792a33 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -42,13 +42,13 @@ #include #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" #include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index d4923a88f4c..6553b49614c 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -13,8 +13,8 @@ #include -#include "db/db_impl/db_impl.h" #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/status.h" diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 69e991ed855..593cd64a0fc 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -10,13 +10,13 @@ #include #include +#include "db/db_impl/db_impl.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/compaction_filter.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/utilities/utility_db.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/db_impl/db_impl.h" +#include "rocksdb/utilities/utility_db.h" #ifdef _WIN32 // Windows API macro interference From d7d8605f56fd4f881869395aa06f9c5f259b5020 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 31 May 2019 16:59:00 -0700 Subject: [PATCH 098/572] Fix a clang analyze warning (#5398) Summary: Clang analyzer is reporting a false positive warning thinking `type` is uninitialized. The variable is initialized by `ParseFileName` by reference so assigning a default value to keep clang happy. 
Current failure: ``` file/filename.cc:435:15: warning: The left operand of '==' is a garbage value (type == kInfoLogFile)) { ~~~~ ^ 1 warning generated. ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5398 Differential Revision: D15588421 Pulled By: miasantreble fbshipit-source-id: fb121c270300f3a659e68bc7f6674ff4ddf2df9a --- file/filename.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/file/filename.cc b/file/filename.cc index 6f00d15ebca..77d9569d3a9 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -413,7 +413,7 @@ Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type; + FileType type = kLogFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; From 79edf0a7a8ab75f60692efd54b1e0ed7da7aafca Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 31 May 2019 17:14:03 -0700 Subject: [PATCH 099/572] util: fix log_write_bench (#5335) Summary: log_write_bench doesn't compile due to some recent API changes. This patch fixes the compile by adding the missing params for OptimizeForLogWrite() and WritableFileWriter(). Signed-off-by: Yuan Zhou Pull Request resolved: https://github.com/facebook/rocksdb/pull/5335 Differential Revision: D15588875 Pulled By: miasantreble fbshipit-source-id: 726ff4dc227733e915c3b796df25bd3ab0b431ac --- util/log_write_bench.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index 9efa43f8a3c..ac4cb685b6e 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -32,13 +32,16 @@ DEFINE_bool(enable_sync, false, "sync after each write."); namespace rocksdb { void RunBenchmark() { std::string file_name = test::PerThreadDBPath("log_write_benchmark.log"); + DBOptions options; Env* env = Env::Default(); - EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions()); + EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions(), options); env_options.bytes_per_sync = FLAGS_bytes_per_sync; std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::unique_ptr writer; - writer.reset(new WritableFileWriter(std::move(file), env_options)); + writer.reset(new WritableFileWriter(std::move(file), file_name, env_options, + env, nullptr /* stats */, + options.listeners)); std::string record; record.assign(FLAGS_record_size, 'X'); From 000b9ec217663faad1d0196b28c623149e01e024 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 31 May 2019 17:19:43 -0700 Subject: [PATCH 100/572] Move some logging related files to logging/ (#5387) Summary: Many logging related source files are under util/. It will be more structured if they are together. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5387 Differential Revision: D15579036 Pulled By: siying fbshipit-source-id: 3850134ed50b8c0bb40a0c8ae1f184fa4081303f --- CMakeLists.txt | 10 +++++----- Makefile | 4 ++-- TARGETS | 10 +++++----- db/builder.h | 2 +- db/compaction/compaction_job.cc | 4 ++-- db/compaction/compaction_job.h | 2 +- db/compaction/compaction_job_stats_test.cc | 2 +- db/compaction/compaction_picker.cc | 2 +- db/compaction/compaction_picker_fifo.cc | 2 +- db/compaction/compaction_picker_level.cc | 2 +- db/compaction/compaction_picker_test.cc | 2 +- db/compaction/compaction_picker_universal.cc | 2 +- db/db_impl/db_impl.cc | 6 +++--- db/db_impl/db_impl.h | 3 ++- db/db_impl/db_impl_secondary.cc | 2 +- db/db_iter.cc | 2 +- db/dbformat.h | 2 +- db/dbformat_test.cc | 2 +- db/event_helpers.h | 2 +- db/fault_injection_test.cc | 2 +- db/filename_test.cc | 2 +- db/flush_job.cc | 6 +++--- db/flush_job.h | 2 +- db/listener_test.cc | 2 +- db/memtable_list.cc | 2 +- db/memtable_list.h | 2 +- db/plain_table_db_test.cc | 2 +- db/version_builder_test.cc | 2 +- db/version_edit.cc | 2 +- db/version_set_test.cc | 2 +- db/wal_manager.cc | 2 +- env/env_hdfs.cc | 2 +- env/env_posix.cc | 4 ++-- env/env_test.cc | 2 +- env/io_posix.cc | 2 +- file/delete_scheduler.cc | 2 +- file/filename.cc | 2 +- java/rocksjni/write_batch.cc | 2 +- {util => logging}/auto_roll_logger.cc | 5 +++-- {util => logging}/auto_roll_logger.h | 0 {util => logging}/auto_roll_logger_test.cc | 4 ++-- {util => logging}/event_logger.cc | 4 ++-- {util => logging}/event_logger.h | 2 +- {util => logging}/event_logger_test.cc | 2 +- {util => logging}/log_buffer.cc | 2 +- {util => logging}/log_buffer.h | 0 {util => logging}/logging.h | 2 +- {env => logging}/posix_logger.h | 0 memory/arena.cc | 2 +- options/db_options.cc | 2 +- port/port_posix.cc | 4 ++-- port/util_logger.h | 2 +- port/win/port_win.cc | 2 +- src.mk | 10 +++++----- table/block_based/block.cc | 2 +- table/block_based/partitioned_filter_block_test.cc | 2 +- table/block_fetcher.cc | 2 +- table/format.cc | 2 +- test_util/transaction_test_util.cc | 2 +- tools/db_stress.cc | 2 +- util/bloom_test.cc | 2 +- util/comparator.cc | 8 ++++---- util/dynamic_bloom_test.cc | 2 +- utilities/backupable/backupable_db.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_db_impl_filesnapshot.cc | 2 +- utilities/blob_db/blob_file.cc | 2 +- utilities/merge_operators/uint64add.cc | 2 +- utilities/persistent_cache/block_cache_tier.cc | 2 +- utilities/persistent_cache/block_cache_tier_file.cc | 2 +- utilities/transactions/optimistic_transaction_test.cc | 2 +- 71 files changed, 96 insertions(+), 94 deletions(-) rename {util => logging}/auto_roll_logger.cc (99%) rename {util => logging}/auto_roll_logger.h (100%) rename {util => logging}/auto_roll_logger_test.cc (99%) rename {util => logging}/event_logger.cc (96%) rename {util => logging}/event_logger.h (99%) rename {util => logging}/event_logger_test.cc (97%) rename {util => logging}/log_buffer.cc (98%) rename {util => logging}/log_buffer.h (100%) rename {util => logging}/logging.h (98%) rename {env => logging}/posix_logger.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c2fa7119c2..1b5f03a0f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -549,6 +549,9 @@ set(SOURCES file/file_util.cc file/filename.cc file/sst_file_manager_impl.cc + logging/auto_roll_logger.cc + logging/event_logger.cc + logging/log_buffer.cc memory/arena.cc memory/concurrent_arena.cc 
memory/jemalloc_nodump_allocator.cc @@ -620,7 +623,6 @@ set(SOURCES tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/auto_roll_logger.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc @@ -629,11 +631,9 @@ set(SOURCES util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc - util/event_logger.cc util/file_reader_writer.cc util/filter_policy.cc util/hash.cc - util/log_buffer.cc util/murmurhash.cc util/random.cc util/rate_limiter.cc @@ -939,6 +939,8 @@ if(WITH_TESTS) env/env_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + logging/auto_roll_logger_test.cc + logging/event_logger_test.cc memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc @@ -963,13 +965,11 @@ if(WITH_TESTS) tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc util/dynamic_bloom_test.cc - util/event_logger_test.cc util/file_reader_writer_test.cc util/filelock_test.cc util/hash_test.cc diff --git a/Makefile b/Makefile index 5181154a212..080e0713355 100644 --- a/Makefile +++ b/Makefile @@ -1498,7 +1498,7 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) -event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1519,7 +1519,7 @@ manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index edddc7b99be..da4f4d9a61d 100644 --- a/TARGETS +++ b/TARGETS @@ -148,6 +148,9 @@ cpp_library( "file/file_util.cc", "file/filename.cc", "file/sst_file_manager_impl.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", "memory/arena.cc", "memory/concurrent_arena.cc", "memory/jemalloc_nodump_allocator.cc", @@ -218,7 +221,6 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/auto_roll_logger.cc", "util/bloom.cc", "util/build_version.cc", "util/coding.cc", @@ -228,11 +230,9 @@ cpp_library( "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", "util/dynamic_bloom.cc", - "util/event_logger.cc", "util/file_reader_writer.cc", "util/filter_policy.cc", "util/hash.cc", - "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", @@ -359,7 +359,7 @@ ROCKS_TESTS = [ ], [ "auto_roll_logger_test", - "util/auto_roll_logger_test.cc", + "logging/auto_roll_logger_test.cc", "serial", ], [ @@ -699,7 +699,7 @@ ROCKS_TESTS = [ ], [ "event_logger_test", - "util/event_logger_test.cc", + "logging/event_logger_test.cc", "serial", ], [ diff --git a/db/builder.h b/db/builder.h index 34a4bff1a25..4fa56f50e34 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include #include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include 
"rocksdb/comparator.h" #include "rocksdb/env.h" @@ -20,7 +21,6 @@ #include "rocksdb/table_properties.h" #include "rocksdb/types.h" #include "table/scoped_arena_iterator.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index b782c6ca7ad..5761345d8a2 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -40,6 +40,8 @@ #include "db/version_set.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -56,8 +58,6 @@ #include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 1387fffb1c1..84d38c163eb 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -29,6 +29,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "options/db_options.h" #include "port/port.h" @@ -40,7 +41,6 @@ #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 91310e9f112..5fb805df5f0 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -28,6 +28,7 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -57,7 +58,6 @@ #include "test_util/testutil.h" #include "util/compression.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 4276ea9cb41..a03f7b46fd1 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -21,9 +21,9 @@ #include #include "db/column_family.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" -#include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index ffb5a9f6495..1fc6ed113d2 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -18,7 +18,7 @@ #include #include #include "db/column_family.h" -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index aeb368ea20a..e9653da8e55 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -16,8 +16,8 @@ #include #include "db/compaction/compaction_picker_level.h" +#include "logging/log_buffer.h" #include 
"test_util/sync_point.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index bab93227a4f..58a0a12f03e 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -12,9 +12,9 @@ #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 465245715fd..e8aca00be81 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -21,9 +21,9 @@ #include #include "db/column_family.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" -#include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 196e38f14fa..9675e727dde 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -56,6 +56,9 @@ #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/iostats_context_imp.h" @@ -86,15 +89,12 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index c241a36dbc3..5461ef300aa 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -40,6 +40,8 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "db/memtable_list.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -52,7 +54,6 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/hash.h" #include "util/repeatable_thread.h" #include "util/stop_watch.h" diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index a976a5750dd..34364d124a8 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -12,8 +12,8 @@ #include "db/db_iter.h" #include "db/merge_context.h" +#include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" -#include "util/auto_roll_logger.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index d953d365e0f..bcfed2bb021 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -17,6 +17,7 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "file/filename.h" +#include "logging/logging.h" #include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" @@ -25,7 +26,6 @@ #include 
"rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" #include "util/trace_replay.h" diff --git a/db/dbformat.h b/db/dbformat.h index 437119fb775..dbf6ea6f3c9 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -14,6 +14,7 @@ #include #include "db/lookup_key.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -23,7 +24,6 @@ #include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" -#include "util/logging.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index f4665b06ca3..9ec1bc34348 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,8 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" +#include "logging/logging.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { diff --git a/db/event_helpers.h b/db/event_helpers.h index ea35b4b5b19..88c72cd4e13 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -10,9 +10,9 @@ #include "db/column_family.h" #include "db/version_edit.h" +#include "logging/event_logger.h" #include "rocksdb/listener.h" #include "rocksdb/table_properties.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 126addc80d1..1d18569f2f4 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -25,7 +26,6 @@ #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index 377d128fae0..bc52e0eae64 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -10,9 +10,9 @@ #include "file/filename.h" #include "db/dbformat.h" +#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index d4ae79ff29a..2b2696c10ba 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -31,6 +31,9 @@ #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -47,9 +50,6 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" diff --git a/db/flush_job.h b/db/flush_job.h index c4081945623..fdb0917bdba 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -28,6 +28,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -37,7 +38,6 @@ #include "rocksdb/transaction_log.h" #include 
"table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 5d8f6eb5e63..9fbd5d0d3ff 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -9,6 +9,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -28,7 +29,6 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ca5283139a5..045bfc9a2d3 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -17,6 +17,7 @@ #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "logging/log_buffer.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "table/merging_iterator.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/memtable_list.h b/db/memtable_list.h index a5f0c123292..a72077ff3d5 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -17,13 +17,13 @@ #include "db/memtable.h" #include "db/range_del_aggregator.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index d2d0426e652..68df71768e2 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -32,7 +33,6 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" #include "utilities/merge_operators.h" diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 63067857420..3a144190cf1 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,9 +6,9 @@ #include #include "db/version_edit.h" #include "db/version_set.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 668ff60f103..ecadf6e3980 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -10,10 +10,10 @@ #include "db/version_edit.h" #include "db/version_set.h" +#include "logging/event_logger.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 9b4072dc777..77890d82638 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -9,10 +9,10 @@ #include "db/version_set.h" #include "db/log_writer.h" +#include 
"logging/logging.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 2fe5305f8d6..71c2ffe4b22 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -24,6 +24,7 @@ #include "db/write_batch_internal.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -32,7 +33,6 @@ #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 9d0354cced8..5bdf03ae3e1 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -17,8 +17,8 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/status.h" -#include "util/logging.h" #include "util/string_util.h" #define HDFS_EXISTS 0 diff --git a/env/env_posix.cc b/env/env_posix.cc index bf1a9e0e5c4..7eb5b7c1451 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -43,7 +43,8 @@ #include #include "env/io_posix.h" -#include "env/posix_logger.h" +#include "logging/logging.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" @@ -52,7 +53,6 @@ #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/logging.h" #include "util/random.h" #include "util/string_util.h" #include "util/thread_local.h" diff --git a/env/env_test.cc b/env/env_test.cc index 615eca8b400..e8cb9b24534 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -38,13 +38,13 @@ #endif #include "env/env_chroot.h" +#include "logging/log_buffer.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" -#include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/env/io_posix.cc b/env/io_posix.cc index 313cbd8eee6..8b42a636295 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,7 @@ #include #include #endif -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 22f28f5375f..b66956ca08c 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -11,10 +11,10 @@ #include #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" -#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/file/filename.cc b/file/filename.cc index 77d9569d3a9..c9f22e585b7 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -16,10 +16,10 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index f1b77446c02..c6d0b9072ae 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -11,6 +11,7 @@ #include "db/write_batch_internal.h" #include 
"include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" +#include "logging/logging.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -20,7 +21,6 @@ #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "table/scoped_arena_iterator.h" -#include "util/logging.h" /* * Class: org_rocksdb_WriteBatch diff --git a/util/auto_roll_logger.cc b/logging/auto_roll_logger.cc similarity index 99% rename from util/auto_roll_logger.cc rename to logging/auto_roll_logger.cc index 9e8d6750319..ec240f5a334 100644 --- a/util/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" + #include #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/logging/auto_roll_logger.h similarity index 100% rename from util/auto_roll_logger.h rename to logging/auto_roll_logger.h diff --git a/util/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc similarity index 99% rename from util/auto_roll_logger_test.cc rename to logging/auto_roll_logger_test.cc index ff47719d490..cce98d374ef 100644 --- a/util/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" #include #include #include @@ -17,11 +17,11 @@ #include #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { namespace { diff --git a/util/event_logger.cc b/logging/event_logger.cc similarity index 96% rename from util/event_logger.cc rename to logging/event_logger.cc index b488984f350..aceccdf93c0 100644 --- a/util/event_logger.cc +++ b/logging/event_logger.cc @@ -7,14 +7,14 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/event_logger.h" +#include "logging/event_logger.h" #include #include #include #include -#include "util/logging.h" +#include "logging/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/util/event_logger.h b/logging/event_logger.h similarity index 99% rename from util/event_logger.h rename to logging/event_logger.h index d88a6a4fe68..c3a7c30c601 100644 --- a/util/event_logger.h +++ b/logging/event_logger.h @@ -10,8 +10,8 @@ #include #include +#include "logging/log_buffer.h" #include "rocksdb/env.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/util/event_logger_test.cc b/logging/event_logger_test.cc similarity index 97% rename from util/event_logger_test.cc rename to logging/event_logger_test.cc index 1ee0c4d9787..cc635d42fbf 100644 --- a/util/event_logger_test.cc +++ b/logging/event_logger_test.cc @@ -5,8 +5,8 @@ #include +#include "logging/event_logger.h" #include "test_util/testharness.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/util/log_buffer.cc b/logging/log_buffer.cc similarity index 98% rename from util/log_buffer.cc rename to logging/log_buffer.cc index d09e0cb002f..74db11c66e3 100644 --- a/util/log_buffer.cc +++ b/logging/log_buffer.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root 
directory). -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "port/sys_time.h" #include "port/port.h" diff --git a/util/log_buffer.h b/logging/log_buffer.h similarity index 100% rename from util/log_buffer.h rename to logging/log_buffer.h diff --git a/util/logging.h b/logging/logging.h similarity index 98% rename from util/logging.h rename to logging/logging.h index a4ef31bd6b5..cad90a309f1 100644 --- a/util/logging.h +++ b/logging/logging.h @@ -19,7 +19,7 @@ inline const char* RocksLogShorterFileName(const char* file) { - // 15 is the length of "util/logging.h". + // 15 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. return file + (sizeof(__FILE__) > 15 ? sizeof(__FILE__) - 15 : 0); } diff --git a/env/posix_logger.h b/logging/posix_logger.h similarity index 100% rename from env/posix_logger.h rename to logging/posix_logger.h diff --git a/memory/arena.cc b/memory/arena.cc index b774225535e..3f113e776a4 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -19,10 +19,10 @@ #include #endif #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" -#include "util/logging.h" namespace rocksdb { diff --git a/options/db_options.cc b/options/db_options.cc index e180238f433..72e348b3227 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -11,12 +11,12 @@ #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "rocksdb/wal_filter.h" -#include "util/logging.h" namespace rocksdb { diff --git a/port/port_posix.cc b/port/port_posix.cc index 80081e480e0..f19d18ff0e6 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -18,11 +18,11 @@ #include #include #include -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/port/util_logger.h b/port/util_logger.h index ba424705b27..d2d62a9879c 100644 --- a/port/util_logger.h +++ b/port/util_logger.h @@ -14,7 +14,7 @@ // of what the new port_.h file must provide. 
#if defined(ROCKSDB_PLATFORM_POSIX) -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 03ba6ef4281..31e65e78cde 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -33,7 +33,7 @@ #include #endif -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/src.mk b/src.mk index 5021acb96ac..38835f8c6d2 100644 --- a/src.mk +++ b/src.mk @@ -72,6 +72,9 @@ LIB_SOURCES = \ file/file_util.cc \ file/filename.cc \ file/sst_file_manager_impl.cc \ + logging/auto_roll_logger.cc \ + logging/event_logger.cc \ + logging/log_buffer.cc \ memory/arena.cc \ memory/concurrent_arena.cc \ memory/jemalloc_nodump_allocator.cc \ @@ -139,7 +142,6 @@ LIB_SOURCES = \ test_util/sync_point_impl.cc \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ - util/auto_roll_logger.cc \ util/bloom.cc \ util/build_version.cc \ util/coding.cc \ @@ -149,11 +151,9 @@ LIB_SOURCES = \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ util/dynamic_bloom.cc \ - util/event_logger.cc \ util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ - util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ @@ -340,6 +340,8 @@ MAIN_SOURCES = \ env/env_basic_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ + logging/auto_roll_logger_test.cc \ + logging/event_logger_test.cc \ memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ memtable/memtablerep_bench.cc \ @@ -369,13 +371,11 @@ MAIN_SOURCES = \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ tools/trace_analyzer_test.cc \ - util/auto_roll_logger_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ util/crc32c_test.cc \ util/dynamic_bloom_test.cc \ - util/event_logger_test.cc \ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ diff --git a/table/block_based/block.cc b/table/block_based/block.cc index dfc4aa3c679..6c7e46d5969 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -15,6 +15,7 @@ #include #include +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" @@ -23,7 +24,6 @@ #include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" -#include "util/logging.h" namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 9a1a4d526f1..70e5bbd3bbd 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -11,11 +11,11 @@ #include "table/full_filter_bits_builder.h" #include "index_builder.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" -#include "util/logging.h" namespace rocksdb { diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 263abbfcf80..72b567fc23d 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,6 +12,7 @@ #include #include +#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -24,7 +25,6 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" #include 
"util/xxhash.h" diff --git a/table/format.cc b/table/format.cc index 3f95fd4d44b..a4441fe5646 100644 --- a/table/format.cc +++ b/table/format.cc @@ -13,6 +13,7 @@ #include #include "block_fetcher.h" +#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -24,7 +25,6 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index 14d39065182..3a7d9e97f50 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -24,7 +24,7 @@ #include "db/dbformat.h" #include "db/snapshot_impl.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/random.h" #include "util/string_util.h" diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 0c828deb165..dc8f8152376 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -47,6 +47,7 @@ int main() { #include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" +#include "logging/logging.h" #include "monitoring/histogram.h" #include "options/options_helper.h" #include "port/port.h" @@ -66,7 +67,6 @@ int main() { #include "util/compression.h" #include "util/crc32c.h" #include "util/gflags_compat.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 7a13728308c..5e61f31ba60 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -17,13 +17,13 @@ int main() { #include +#include "logging/logging.h" #include "memory/arena.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/logging.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/comparator.cc b/util/comparator.cc index b42c23725fc..eab17ebccf3 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -7,13 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "rocksdb/comparator.h" +#include #include #include -#include -#include "rocksdb/comparator.h" -#include "rocksdb/slice.h" +#include "logging/logging.h" #include "port/port.h" -#include "util/logging.h" +#include "rocksdb/slice.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 036e0128008..7ca8bb891aa 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -24,12 +24,12 @@ int main() { #include #include "dynamic_bloom.h" +#include "logging/logging.h" #include "memory/arena.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/logging.h" #include "util/stop_watch.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index dcd88ffdb8c..7a2e1940316 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -11,6 +11,7 @@ #include "rocksdb/utilities/backupable_db.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" @@ -19,7 +20,6 @@ #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/string_util.h" #include "utilities/checkpoint/checkpoint_impl.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 7f447a04ad0..25583fa981a 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -16,6 +16,7 @@ #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics.h" #include "rocksdb/convenience.h" @@ -31,7 +32,6 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index 16b9ff826e6..e74396a33d6 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -8,7 +8,7 @@ #include "utilities/blob_db/blob_db_impl.h" #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/mutexlock.h" // BlobDBImpl methods to get snapshot of files, e.g. for replication. 
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 4475772d8d1..03cff7834b9 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -21,7 +21,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index b998e1b8e4e..e71ecfd9a5b 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -5,11 +5,11 @@ #include +#include "logging/logging.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "util/coding.h" -#include "util/logging.h" #include "utilities/merge_operators.h" using namespace rocksdb; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 5baf64772cc..2169f906955 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -10,9 +10,9 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "test_util/sync_point.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "utilities/persistent_cache/block_cache_tier_file.h" diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index ce6335fb586..0fb17b369e3 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -13,9 +13,9 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "util/crc32c.h" -#include "util/logging.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 4f075d0d9fc..5e1af2fb1f5 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -9,6 +9,7 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" @@ -16,7 +17,6 @@ #include "test_util/testharness.h" #include "test_util/transaction_test_util.h" #include "util/crc32c.h" -#include "util/logging.h" #include "util/random.h" using std::string; From 349db9049732ad1f6c7466483b4e79c8817730dd Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 3 Jun 2019 12:31:45 -0700 Subject: [PATCH 101/572] Make GetEntryFromCache a member function. (#5394) Summary: This commit makes GetEntryFromCache a member function of BlockBasedTable and turns all of its callers into member functions as well.
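As a rough sketch of the shape of this refactoring (hypothetical `MiniTable`/`MiniRep` types, not RocksDB's), per-table state such as the level moves from an explicit parameter into the `rep_` that the member function already owns:

```cpp
#include <cstdio>
#include <string>

// Hypothetical miniature of the refactor; names do not match RocksDB's.
struct MiniRep {
  int level;
};

class MiniTable {
 public:
  explicit MiniTable(int level) : rep_{level} {}

  // Before: a free function to which every caller had to pass table state:
  //   Handle* GetEntryFromCache(Cache*, const Slice& key, int level, ...);
  // After: a const member function that reads that state through rep_.
  void GetEntryFromCache(const std::string& key) const {
    std::printf("lookup %s at level %d\n", key.c_str(), rep_.level);
  }

 private:
  MiniRep rep_;
};

int main() {
  MiniTable t(/*level=*/2);
  t.GetEntryFromCache("k1");  // callers no longer thread rep/level through
  return 0;
}
```

As the hunks below show, call sites stop passing `rep`-derived arguments explicitly, which is the whole payoff of the change.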
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5394 Differential Revision: D15579222 Pulled By: HaoyuHuang fbshipit-source-id: 07509c42ee9022dcded54950012bd3bd562aa1ae --- table/block_based/block_based_table_reader.cc | 513 +++++++++--------- table/block_based/block_based_table_reader.h | 112 ++-- 2 files changed, 311 insertions(+), 314 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index b7fba779f47..2fdaf2afd2a 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -17,8 +17,6 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "table/block_fetcher.h" -#include "table/meta_blocks.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -29,6 +27,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" + #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" @@ -36,9 +35,11 @@ #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" @@ -128,51 +129,6 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } -Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - int level, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, - GetContext* get_context) { - auto cache_handle = block_cache->Lookup(key, statistics); - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); - // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } - } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } - } - - return cache_handle; -} - // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. 
This flag will be used // as total_order_seek via NewIndexIterator @@ -275,8 +231,8 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( assert(rep != nullptr); constexpr bool is_index = true; - const Status s = BlockBasedTable::RetrieveBlock( - prefetch_buffer, rep, read_options, rep->footer.index_handle(), + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); return s; @@ -446,10 +402,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks - s = BlockBasedTable::MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), rep, ro, handle, - UncompressionDict::GetEmptyDict(), &block, is_index, - nullptr /* get_context */); + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, is_index, nullptr /* get_context */); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -707,6 +662,49 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { std::unique_ptr prefix_index_; }; +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, Statistics* statistics, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, statistics); + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + if (get_context != nullptr) { + // overall cache hit + get_context->get_context_stats_.num_cache_hit++; + // total bytes read from cache + get_context->get_context_stats_.num_cache_bytes_read += + block_cache->GetUsage(cache_handle); + // block-type specific cache hit + (*block_cache_hit_stats)++; + } else { + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // total bytes read from cache + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(cache_handle)); + RecordTick(statistics, block_cache_hit_ticker); + } + } else { + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + if (get_context != nullptr) { + // overall cache miss + get_context->get_context_stats_.num_cache_miss++; + // block-type specific cache miss + (*block_cache_miss_stats)++; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + RecordTick(statistics, block_cache_miss_ticker); + } + } + + return cache_handle; +} + // Helper function to setup the cache key's prefix for the Table. 
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); @@ -938,24 +936,24 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Read metaindex std::unique_ptr meta; std::unique_ptr meta_iter; - s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); + s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter); if (!s.ok()) { return s; } - s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), - largest_seqno); + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(), + largest_seqno); if (!s.ok()) { return s; } - s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), - internal_comparator); + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(), + internal_comparator); if (!s.ok()) { return s; } - s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefetch_all, table_options, - level); + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all, + table_options, level); if (s.ok()) { // Update tail prefetch stats @@ -1043,7 +1041,7 @@ Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, } Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, TableProperties** table_properties) { assert(table_properties != nullptr); // If this is an external SST file ingested with write_global_seqno set to @@ -1054,8 +1052,8 @@ Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( // original value, i.e. 0, and verify the checksum again. BlockHandle props_block_handle; CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, - rep->footer, rep->ioptions, table_properties, + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, false /* verify_checksum */, &props_block_handle, &tmp_buf, false /* compression_type_missing */, nullptr /* memory_allocator */); @@ -1071,21 +1069,21 @@ Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); } uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), + s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(), block_size + 1, value); } return s; } Status BlockBasedTable::ReadPropertiesBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const SequenceNumber largest_seqno) { bool found_properties_block = true; Status s; s = SeekToPropertiesBlock(meta_iter, &found_properties_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.info_log, "Error when seeking to properties block from file: %s", s.ToString().c_str()); } else if (found_properties_block) { @@ -1093,15 +1091,15 @@ Status BlockBasedTable::ReadPropertiesBlock( TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties( - meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, - rep->ioptions, &table_properties, true /* verify_checksum */, + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* 
verify_checksum */, nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, false /* compression_type_missing */, nullptr /* memory_allocator */); } if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno( - rep, prefetch_buffer, meta_iter->value(), &table_properties); + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); } std::unique_ptr props_guard; if (table_properties != nullptr) { @@ -1109,53 +1107,55 @@ Status BlockBasedTable::ReadPropertiesBlock( } if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.info_log, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); } else { assert(table_properties != nullptr); - rep->table_properties.reset(props_guard.release()); - rep->blocks_maybe_compressed = rep->table_properties->compression_name != - CompressionTypeToString(kNoCompression); - rep->blocks_definitely_zstd_compressed = - (rep->table_properties->compression_name == + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == CompressionTypeToString(kZSTD) || - rep->table_properties->compression_name == + rep_->table_properties->compression_name == CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "Cannot find Properties block from file."); } #ifndef ROCKSDB_LITE - if (rep->table_properties) { - ParseSliceTransform(rep->table_properties->prefix_extractor_name, - &(rep->table_prefix_extractor)); + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); } #endif // ROCKSDB_LITE // Read the table properties, if provided. 
- if (rep->table_properties) { - rep->whole_key_filtering &= - IsFeatureSupported(*(rep->table_properties), + if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep->ioptions.info_log); - rep->prefix_filtering &= IsFeatureSupported( - *(rep->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - - s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, - &(rep->global_seqno)); + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); } } return s; } Status BlockBasedTable::ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator) { Status s; bool found_range_del_block; @@ -1163,13 +1163,13 @@ Status BlockBasedTable::ReadRangeDelBlock( s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); } else if (found_range_del_block && !range_del_handle.IsNull()) { ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( - rep, read_options, range_del_handle, nullptr /* input_iter */, + read_options, range_del_handle, nullptr /* input_iter */, false /* is_index */, true /* key_includes_seq */, true /* index_key_is_full */, nullptr /* get_context */, Status(), prefetch_buffer)); @@ -1177,11 +1177,11 @@ Status BlockBasedTable::ReadRangeDelBlock( s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Encountered error while reading data from range del block %s", s.ToString().c_str()); } else { - rep->fragmented_range_dels = + rep_->fragmented_range_dels = std::make_shared(std::move(iter), internal_comparator); } @@ -1190,25 +1190,25 @@ Status BlockBasedTable::ReadRangeDelBlock( } Status BlockBasedTable::ReadCompressionDictBlock( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) { + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block) const { assert(compression_dict_block != nullptr); Status s; - if (!rep->compression_dict_handle.IsNull()) { + if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr compression_dict_cont{new BlockContents()}; PersistentCacheOptions cache_options; ReadOptions read_options; read_options.verify_checksums = true; BlockFetcher compression_block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, read_options, - rep->compression_dict_handle, compression_dict_cont.get(), - rep->ioptions, false /* decompress */, false /*maybe_compressed*/, + rep_->file.get(), prefetch_buffer, rep_->footer, read_options, + rep_->compression_dict_handle, compression_dict_cont.get(), + rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), cache_options); s = compression_block_fetcher.ReadBlockContents(); if (!s.ok()) { 
ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); @@ -1220,13 +1220,13 @@ Status BlockBasedTable::ReadCompressionDictBlock( } Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level) { Status s; // Find filter handle and filter type - if (rep->filter_policy) { + if (rep_->filter_policy) { for (auto filter_type : {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, Rep::FilterType::kBlockFilter}) { @@ -1245,10 +1245,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( assert(0); } std::string filter_block_key = prefix; - filter_block_key.append(rep->filter_policy->Name()); - if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) .ok()) { - rep->filter_type = filter_type; + rep_->filter_type = filter_type; break; } } @@ -1258,7 +1258,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // Find compression dictionary handle bool found_compression_dict; s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep->compression_dict_handle); + &rep_->compression_dict_handle); } BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); @@ -1272,13 +1272,14 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // prefetch the first level of filter const bool prefetch_filter = - prefetch_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); + prefetch_all || + (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); // Partition fitlers cannot be enabled without partition indexes assert(!prefetch_filter || prefetch_index); // pin both index and filters, down to all partitions const bool pin_all = - rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; // pin the first level of index const bool pin_index = pin_all || (table_options.pin_top_level_index_and_filter && @@ -1286,7 +1287,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // pin the first level of filter const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter); IndexReader* index_reader = nullptr; if (s.ok()) { @@ -1294,12 +1295,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( prefetch_index, pin_index, &index_reader); if (s.ok()) { assert(index_reader != nullptr); - rep->index_reader.reset(index_reader); + rep_->index_reader.reset(index_reader); // The partitions of partitioned index are always stored in cache. 
They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks if (prefetch_all) { - rep->index_reader->CacheDependencies(pin_all); + rep_->index_reader->CacheDependencies(pin_all); } } else { delete index_reader; @@ -1318,43 +1319,43 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = - new_table->GetFilter(rep->table_prefix_extractor.get()); + new_table->GetFilter(rep_->table_prefix_extractor.get()); if (filter_entry.GetValue() != nullptr && prefetch_all) { filter_entry.GetValue()->CacheDependencies( - pin_all, rep->table_prefix_extractor.get()); + pin_all, rep_->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the // cache while this reader is alive if (pin_filter) { - rep->filter_entry = std::move(filter_entry); + rep_->filter_entry = std::move(filter_entry); } } } else { std::unique_ptr compression_dict_block; if (s.ok()) { // Set filter block - if (rep->filter_policy) { + if (rep_->filter_policy) { const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle, - !is_a_filter_partition, - rep->table_prefix_extractor.get()); - rep->filter.reset(filter); + auto filter = new_table->ReadFilter( + prefetch_buffer, rep_->filter_handle, !is_a_filter_partition, + rep_->table_prefix_extractor.get()); + rep_->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached if (filter && prefetch_all) { - filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); + filter->CacheDependencies(pin_all, + rep_->table_prefix_extractor.get()); } } - s = ReadCompressionDictBlock(rep, prefetch_buffer, - &compression_dict_block); + s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); } - if (s.ok() && !rep->compression_dict_handle.IsNull()) { + if (s.ok() && !rep_->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep->uncompression_dict.reset(new UncompressionDict( + rep_->uncompression_dict.reset(new UncompressionDict( compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics)); + rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); } } return s; @@ -1399,23 +1400,22 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { // Load the meta-block from the file. On success, return the loaded meta block // and its iterator. -Status BlockBasedTable::ReadMetaBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, +Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* meta_block, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. 
std::unique_ptr meta; Status s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - rep->footer.metaindex_handle(), &meta, rep->ioptions, + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &meta, rep_->ioptions, true /* decompress */, true /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep->table_options)); + GetMemoryAllocator(rep_->table_options)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1431,22 +1431,24 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, - bool is_index, GetContext* get_context) { + const UncompressionDict& uncompression_dict, bool is_index, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep->ioptions.statistics; + Statistics* statistics = rep_->ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { auto cache_handle = GetEntryFromCache( - block_cache, block_cache_key, rep->level, + block_cache, block_cache_key, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, is_index ? 
BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, get_context @@ -1498,16 +1500,16 @@ Status BlockBasedTable::GetDataBlockFromCache( BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, compressed_block->data.data(), - compressed_block->data.size(), &contents, - rep->table_options.format_version, rep->ioptions, - GetMemoryAllocator(rep->table_options)); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); // Insert uncompressed block into block cache if (s.ok()) { std::unique_ptr block_holder( - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, statistics)); // uncompressed block + new Block(std::move(contents), rep_->get_global_seqno(is_index), + read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { @@ -1566,13 +1568,20 @@ Status BlockBasedTable::GetDataBlockFromCache( Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, + CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index, Cache::Priority priority, GetContext* get_context) { - + MemoryAllocator* memory_allocator, bool is_index, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + const Cache::Priority priority = + is_index && rep_->table_options + .cache_index_and_filter_blocks_with_high_priority + ? Cache::Priority::HIGH + : Cache::Priority::LOW; assert(cached_block); assert(cached_block->IsEmpty()); assert(raw_block_comp_type == kNoCompression || @@ -1791,8 +1800,7 @@ CachableEntry BlockBasedTable::GetFilter( Statistics* statistics = rep_->ioptions.statistics; Cache::Handle* cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, + block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, get_context ? &get_context->get_context_stats_.num_cache_filter_miss : nullptr, get_context ? &get_context->get_context_stats_.num_cache_filter_hit @@ -1843,25 +1851,24 @@ CachableEntry BlockBasedTable::GetFilter( false /* own_value */}; } -CachableEntry -BlockBasedTable::GetUncompressionDict(const Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) { - if (!rep->table_options.cache_index_and_filter_blocks) { +CachableEntry BlockBasedTable::GetUncompressionDict( + FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context) const { + if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. 
- return {rep->uncompression_dict.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->uncompression_dict.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } - if (rep->compression_dict_handle.IsNull()) { + if (rep_->compression_dict_handle.IsNull()) { return CachableEntry(); } char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto cache_key = - GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - rep->compression_dict_handle, cache_key_buf); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key_buf); auto cache_handle = GetEntryFromCache( - rep->table_options.block_cache.get(), cache_key, rep->level, + rep_->table_options.block_cache.get(), cache_key, BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, get_context ? &get_context->get_context_stats_.num_cache_compression_dict_miss @@ -1869,29 +1876,29 @@ BlockBasedTable::GetUncompressionDict(const Rep* rep, get_context ? &get_context->get_context_stats_.num_cache_compression_dict_hit : nullptr, - rep->ioptions.statistics, get_context); + rep_->ioptions.statistics, get_context); UncompressionDict* dict = nullptr; if (cache_handle != nullptr) { dict = reinterpret_cast( - rep->table_options.block_cache->Value(cache_handle)); + rep_->table_options.block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. } else { std::unique_ptr compression_dict_block; Status s = - ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); + ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); size_t usage = 0; if (s.ok()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, - rep->ioptions.statistics); + rep_->blocks_definitely_zstd_compressed, + rep_->ioptions.statistics); usage = dict->ApproximateMemoryUsage(); - s = rep->table_options.block_cache->Insert( + s = rep_->table_options.block_cache->Insert( cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, &cache_handle, - rep->table_options.cache_index_and_filter_blocks_with_high_priority + rep_->table_options.cache_index_and_filter_blocks_with_high_priority ? Cache::Priority::HIGH : Cache::Priority::LOW); } @@ -1904,23 +1911,23 @@ BlockBasedTable::GetUncompressionDict(const Rep* rep, get_context->get_context_stats_ .num_cache_compression_dict_bytes_insert += usage; } else { - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep->ioptions.statistics, + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); } } else { // There should be no way to get here if block cache insertion succeeded. // Though it is still possible something failed earlier. - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete dict; dict = nullptr; assert(cache_handle == nullptr); } } - return {dict, cache_handle ? 
rep->table_options.block_cache.get() : nullptr, - cache_handle, false /* own_value */}; + return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, + cache_handle, false /* own_value */}; } // disable_prefix_seek should be set to true when prefix_extractor found in SST @@ -1943,10 +1950,10 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( // If input_iter is not null, update this iter and return it template TBlockIter* BlockBasedTable::NewDataBlockIterator( - const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, Status s, - FilePrefetchBuffer* prefetch_buffer) { + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + bool is_index, bool key_includes_seq, bool index_key_is_full, + GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; @@ -1957,15 +1964,15 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool no_io = (ro.read_tier == kBlockCacheTier); auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + GetUncompressionDict(prefetch_buffer, no_io, get_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() : *uncompression_dict_storage.GetValue(); CachableEntry block; - s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict, - &block, is_index, get_context); + s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, + is_index, get_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -1984,16 +1991,16 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // Otherwise, the block is pinned iff the source is immortal. 
const bool block_contents_pinned = block.IsCached() || - (!block.GetValue()->own_bytes() && rep->immortal_table); + (!block.GetValue()->own_bytes() && rep_->immortal_table); iter = block.GetValue()->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + &rep_->internal_comparator, rep_->internal_comparator.user_comparator(), + iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq, index_key_is_full, block_contents_pinned); if (!block.IsCached()) { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep->table_options.block_cache.get(); + Cache* const block_cache = rep_->table_options.block_cache.get(); Cache::Handle* cache_handle = nullptr; // There are two other types of cache keys: 1) SST cache key added in // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in @@ -2002,11 +2009,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // differentiate from `write_buffer_manager` const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s + // Prefix: use rep_->cache_key_prefix padded by 0s memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, next_cache_key_id_++); assert(end - cache_key <= @@ -2028,17 +2035,18 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep->table_options.block_cache.get(); + Cache* block_cache = rep_->table_options.block_cache.get(); // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - rep->immortal_table ? nullptr - : rep->table_options.block_cache_compressed.get(); + rep_->immortal_table ? 
nullptr + : rep_->table_options.block_cache_compressed.get(); // First, try to get the block from the cache // @@ -2051,58 +2059,50 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, cache_key); } if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep->compressed_cache_key_prefix, - rep->compressed_cache_key_prefix_size, handle, + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, compressed_cache_key); } - s = GetDataBlockFromCache( - key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry, - uncompression_dict, - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, is_index, - get_context); + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, is_index, + get_context); // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep->ioptions.statistics; + Statistics* statistics = rep_->ioptions.statistics; bool do_decompress = - block_cache_compressed == nullptr && rep->blocks_maybe_compressed; + block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; { - StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block_contents, rep->ioptions, - do_decompress /* do uncompress */, rep->blocks_maybe_compressed, - uncompression_dict, rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options), - GetMemoryAllocatorForCompressedBlock(rep->table_options)); + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, + do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, + uncompression_dict, rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); } if (s.ok()) { - SequenceNumber seq_no = rep->get_global_seqno(is_index); + SequenceNumber seq_no = rep_->get_global_seqno(is_index); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. - s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, &raw_block_contents, raw_block_comp_type, - rep->table_options.format_version, uncompression_dict, seq_no, - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep->table_options), is_index, - is_index && rep->table_options - .cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW, - get_context); + s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, + block_entry, &raw_block_contents, + raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), + is_index, get_context); } } } @@ -2111,16 +2111,16 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { - assert(rep); + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; - if (!is_index || rep->table_options.cache_index_and_filter_blocks) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + if (!is_index || rep_->table_options.cache_index_and_filter_blocks) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, uncompression_dict, block_entry, is_index, get_context); @@ -2144,15 +2144,15 @@ Status BlockBasedTable::RetrieveBlock( std::unique_ptr block; { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, &block, - rep->ioptions, rep->blocks_maybe_compressed, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, rep->get_global_seqno(is_index), - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep->table_options)); + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, rep_->blocks_maybe_compressed, + rep_->blocks_maybe_compressed, uncompression_dict, + rep_->persistent_cache_options, rep_->get_global_seqno(is_index), + !is_index ? 
rep_->table_options.read_amp_bytes_per_bit : 0, + GetMemoryAllocator(rep_->table_options)); } if (!s.ok()) { @@ -2530,8 +2530,8 @@ void BlockBasedTableIterator::InitDataBlock() { } Status s; - BlockBasedTable::NewDataBlockIterator( - rep, read_options_, data_block_handle, &block_iter_, is_index_, + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, is_index_, key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; @@ -2775,7 +2775,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } else { DataBlockIter biter; NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, false, true /* key_includes_seq */, true /* index_key_is_full */, get_context); @@ -2886,7 +2886,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { DataBlockIter biter; NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, false, true /* key_includes_seq */, get_context); if (read_options.read_tier == kBlockCacheTier && @@ -2989,8 +2989,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, // Load the block specified by the block_handle into the block cache DataBlockIter biter; - NewDataBlockIterator(rep_, ReadOptions(), block_handle, - &biter); + NewDataBlockIterator(ReadOptions(), block_handle, &biter); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -3006,7 +3005,7 @@ Status BlockBasedTable::VerifyChecksum() { // Check Meta blocks std::unique_ptr meta; std::unique_ptr meta_iter; - s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); + s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter); if (s.ok()) { s = VerifyChecksumInMetaBlocks(meta_iter.get()); if (!s.ok()) { @@ -3075,7 +3074,7 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( s = block_fetcher.ReadBlockContents(); if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, + s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, index_iter->value(), &table_properties); delete table_properties; @@ -3170,8 +3169,7 @@ Status BlockBasedTable::CreateIndexReader( std::unique_ptr meta_iter_guard; auto meta_index_iter = preloaded_meta_index_iter; if (meta_index_iter == nullptr) { - auto s = - ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); + auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard); if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. 
@@ -3251,7 +3249,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); + ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { @@ -3296,8 +3294,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, "--------------------------------------\n"); std::unique_ptr meta; std::unique_ptr meta_iter; - Status s = - ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); + Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter); if (s.ok()) { for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { s = meta_iter->status(); @@ -3387,7 +3384,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, // Output compression dictionary if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */, &compression_dict_block); if (!s.ok()) { return s; @@ -3543,7 +3540,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); + ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index f6f610ca2ac..e53248fbcba 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -219,12 +219,12 @@ class BlockBasedTable : public TableReader { // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - const Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr); + FilePrefetchBuffer* prefetch_buffer = nullptr) const; class PartitionedIndexIteratorState; @@ -238,6 +238,14 @@ class BlockBasedTable : public TableReader { friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, + Statistics* statistics, + GetContext* get_context) const; + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -247,22 +255,20 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. 
- static Status MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, - const ReadOptions& ro, const BlockHandle& handle, - const UncompressionDict& uncompression_dict, + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, bool is_index = false, - GetContext* get_context = nullptr); + GetContext* get_context = nullptr) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). - static Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, - const Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, - const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, - GetContext* get_context); + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -276,9 +282,9 @@ class BlockBasedTable : public TableReader { const bool is_a_filter_partition, bool no_io, GetContext* get_context, const SliceTransform* prefix_extractor = nullptr) const; - static CachableEntry GetUncompressionDict( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context); + CachableEntry GetUncompressionDict( + FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context) const; // Get the iterator from the index reader. // If input_iter is not set, return new Iterator @@ -301,13 +307,12 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status GetDataBlockFromCache( + Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, - size_t read_amp_bytes_per_bit, bool is_index = false, - GetContext* get_context = nullptr); + const UncompressionDict& uncompression_dict, bool is_index = false, + GetContext* get_context = nullptr) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -319,16 +324,16 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
- static Status PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, - GetContext* get_context = nullptr); + Status PutDataBlockToCache(const Slice& block_cache_key, + const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, + BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, + SequenceNumber seq_no, + MemoryAllocator* memory_allocator, bool is_index, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -336,8 +341,6 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - void ReadMeta(const Footer& footer); - // Figure the index type, update it in rep_, and also return it. BlockBasedTableOptions::IndexType UpdateIndexType(); @@ -365,28 +368,25 @@ class BlockBasedTable : public TableReader { TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter); - static Status TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties); - static Status ReadPropertiesBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const SequenceNumber largest_seqno); - static Status ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); - static Status ReadCompressionDictBlock( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block); - static Status PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, BlockBasedTable* new_table, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level); + Status ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* meta_block, + std::unique_ptr* iter); + Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, + const Slice& handle_value, + TableProperties** table_properties); + Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator); + Status ReadCompressionDictBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block) const; + Status PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level); Status 
VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter);
   Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter);

From 5851cb7fdbb85a19dc0d3d9cc0a61adeb9a3ae02 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Mon, 3 Jun 2019 13:21:02 -0700
Subject: [PATCH 102/572] Move util/trace_replay.* to trace_replay/ (#5376)

Summary:
util/ is meant for lower-level libraries. trace_replay is highly integrated
with DB and sometimes calls DB. Move it out to a separate directory.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5376

Differential Revision: D15550938

Pulled By: siying

fbshipit-source-id: f46dce5ceffdc05a73f26379c7bb1b79ebe6c207
---
 CMakeLists.txt                              | 2 +-
 TARGETS                                     | 2 +-
 db/db_impl/db_impl.h                        | 2 +-
 db/db_iter.cc                               | 2 +-
 src.mk                                      | 2 +-
 tools/trace_analyzer_test.cc                | 2 +-
 tools/trace_analyzer_tool.cc                | 2 +-
 tools/trace_analyzer_tool.h                 | 2 +-
 {util => trace_replay}/trace_replay.cc      | 2 +-
 {util => trace_replay}/trace_replay.h       | 0
 utilities/trace/file_trace_reader_writer.cc | 2 +-
 11 files changed, 10 insertions(+), 10 deletions(-)
 rename {util => trace_replay}/trace_replay.cc (99%)
 rename {util => trace_replay}/trace_replay.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b5f03a0f3b..7cb4cc7a863 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -623,6 +623,7 @@ set(SOURCES
   tools/ldb_tool.cc
   tools/sst_dump_tool.cc
   tools/trace_analyzer_tool.cc
+  trace_replay/trace_replay.cc
   util/bloom.cc
   util/coding.cc
   util/compaction_job_stats_impl.cc
@@ -642,7 +643,6 @@ set(SOURCES
   util/string_util.cc
   util/thread_local.cc
   util/threadpool_imp.cc
-  util/trace_replay.cc
   util/xxhash.cc
   utilities/backupable/backupable_db.cc
   utilities/blob_db/blob_compaction_filter.cc
diff --git a/TARGETS b/TARGETS
index da4f4d9a61d..a635ed5ac7d 100644
--- a/TARGETS
+++ b/TARGETS
@@ -221,6 +221,7 @@ cpp_library(
         "tools/ldb_cmd.cc",
         "tools/ldb_tool.cc",
         "tools/sst_dump_tool.cc",
+        "trace_replay/trace_replay.cc",
         "util/bloom.cc",
         "util/build_version.cc",
         "util/coding.cc",
@@ -241,7 +242,6 @@ cpp_library(
         "util/string_util.cc",
         "util/thread_local.cc",
         "util/threadpool_imp.cc",
-        "util/trace_replay.cc",
         "util/xxhash.cc",
         "utilities/backupable/backupable_db.cc",
         "utilities/blob_db/blob_compaction_filter.cc",
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 5461ef300aa..f73e8665fb6 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -53,12 +53,12 @@
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/scoped_arena_iterator.h"
+#include "trace_replay/trace_replay.h"
 #include "util/autovector.h"
 #include "util/hash.h"
 #include "util/repeatable_thread.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
-#include "util/trace_replay.h"

 namespace rocksdb {
diff --git a/db/db_iter.cc b/db/db_iter.cc
index bcfed2bb021..29a1a9eac1a 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -26,9 +26,9 @@
 #include "rocksdb/options.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
 #include "util/mutexlock.h"
 #include "util/string_util.h"
-#include "util/trace_replay.h"
 #include "util/user_comparator_wrapper.h"

 namespace rocksdb {
diff --git a/src.mk b/src.mk
index 38835f8c6d2..c172d0b2c2d 100644
--- a/src.mk
+++ b/src.mk
@@ -142,6 +142,7 @@ LIB_SOURCES = \
   test_util/sync_point_impl.cc \
   test_util/transaction_test_util.cc \
   tools/dump/db_dump_tool.cc \
+  trace_replay/trace_replay.cc \
   util/bloom.cc \
   util/build_version.cc \
   util/coding.cc \
@@ -162,7 +163,6 @@
util/string_util.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ - util/trace_replay.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index 7c242f60f26..dcc954384fd 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -30,7 +30,7 @@ int main() { #include "test_util/testharness.h" #include "test_util/testutil.h" #include "tools/trace_analyzer_tool.h" -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" namespace rocksdb { diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 93528c00608..6ab606f6a6a 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -44,13 +44,13 @@ #include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "tools/trace_analyzer_tool.h" +#include "trace_replay/trace_replay.h" #include "util/coding.h" #include "util/compression.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/trace_replay.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/tools/trace_analyzer_tool.h b/tools/trace_analyzer_tool.h index be96f5005da..4c3b973b79c 100644 --- a/tools/trace_analyzer_tool.h +++ b/tools/trace_analyzer_tool.h @@ -16,7 +16,7 @@ #include "rocksdb/env.h" #include "rocksdb/trace_reader_writer.h" #include "rocksdb/write_batch.h" -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" namespace rocksdb { diff --git a/util/trace_replay.cc b/trace_replay/trace_replay.cc similarity index 99% rename from util/trace_replay.cc rename to trace_replay/trace_replay.cc index 9e0e8c48cde..f9448069b80 100644 --- a/util/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" #include #include diff --git a/util/trace_replay.h b/trace_replay/trace_replay.h similarity index 100% rename from util/trace_replay.h rename to trace_replay/trace_replay.h diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index 4a81516a8b7..d8e36c31276 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -5,9 +5,9 @@ #include "utilities/trace/file_trace_reader_writer.h" +#include "trace_replay/trace_replay.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/trace_replay.h" namespace rocksdb { From ae05a83e19ff53ed0cb83e248ba19bc9f3b07a07 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 3 Jun 2019 19:47:02 -0700 Subject: [PATCH 103/572] Call ValidateOptions from SetOptions (#5368) Summary: Currently we validate options in DB::Open. However the validation step is missing when options are dynamically updated in ::SetOptions. The patch fixes that. 
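A minimal sketch of the newly covered path (the DB path and option values
below are hypothetical; the TTL/max_open_files constraint is the one
ValidateOptions enforces for every live column family):

  #include <cassert>

  #include "rocksdb/db.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    options.max_open_files = -1;  // TTL requires files to be kept open
    options.ttl = 60 * 60 * 24;   // per-CF TTL of one day
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/validate_demo", &db);
    assert(s.ok());
    // Before this patch, the dynamic update below was applied even though
    // TTL is only supported with max_open_files == -1; with validation wired
    // into SetDBOptions, the inconsistent combination is now rejected.
    s = db->SetDBOptions({{"max_open_files", "100"}});
    assert(s.IsNotSupported());
    delete db;
    return 0;
  }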
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5368 Differential Revision: D15540101 Pulled By: maysamyabandeh fbshipit-source-id: d27bbffd8f0252d1b50bcf59e0a70a278ed937f4 --- db/column_family.cc | 49 ++++++++++++++++++- db/column_family.h | 4 ++ db/db_impl/db_impl.cc | 29 ++++++++--- db/db_impl/db_impl.h | 7 +++ db/db_impl/db_impl_open.cc | 46 +++-------------- db/db_options_test.cc | 4 +- db/db_test.cc | 3 ++ options/options_test.cc | 7 +-- test_util/testutil.cc | 15 ++++-- test_util/testutil.h | 2 +- utilities/options/options_util_test.cc | 4 +- .../transactions/write_prepared_txn_db.cc | 2 +- 12 files changed, 114 insertions(+), 58 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index ce22a00aac3..531cbeca681 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1148,13 +1148,60 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +Status ColumnFamilyData::ValidateOptions( + const DBOptions& db_options, const ColumnFamilyOptions& cf_options) { + Status s; + s = CheckCompressionSupported(cf_options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cf_options); + } + if (!s.ok()) { + return s; + } + + if (cf_options.ttl > 0) { + if (db_options.max_open_files != -1) { + return Status::NotSupported( + "TTL is only supported when files are always " + "kept open (set max_open_files = -1). "); + } + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "TTL is only supported in Block-Based Table format. "); + } + } + + if (cf_options.periodic_compaction_seconds > 0) { + if (db_options.max_open_files != -1) { + return Status::NotSupported( + "Periodic Compaction is only supported when files are always " + "kept open (set max_open_files = -1). "); + } + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "Periodic Compaction is only supported in " + "Block-Based Table format. 
"); + } + } + return s; +} + #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const std::unordered_map& options_map) { + const DBOptions& db_options, + const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, ioptions_.info_log, &new_mutable_cf_options); + if (s.ok()) { + ColumnFamilyOptions cf_options = + BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); + s = ValidateOptions(db_options, cf_options); + } if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); diff --git a/db/column_family.h b/db/column_family.h index 655cb159261..8646b4fc197 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -338,9 +338,13 @@ class ColumnFamilyData { bool is_delete_range_supported() { return is_delete_range_supported_; } + // Validate CF options against DB options + static Status ValidateOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); #ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( + const DBOptions& db_options, const std::unordered_map& options_map); #endif // ROCKSDB_LITE diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 9675e727dde..ba76abc2875 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -848,8 +848,9 @@ Status DBImpl::SetOptions( Status persist_options_status; SuperVersionContext sv_context(/* create_superversion */ true); { + auto db_options = GetDBOptions(); InstrumentedMutexLock l(&mutex_); - s = cfd->SetOptions(options_map); + s = cfd->SetOptions(db_options, options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. 
@@ -912,6 +913,25 @@ Status DBImpl::SetDBOptions( InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } if (s.ok()) { if (new_options.max_background_compactions > mutable_db_options_.max_background_compactions) { @@ -956,15 +976,12 @@ Status DBImpl::SetDBOptions( : new_options.max_open_files - 10); wal_changed = mutable_db_options_.wal_bytes_per_sync != new_options.wal_bytes_per_sync; - if (new_options.bytes_per_sync == 0) { - new_options.bytes_per_sync = 1024 * 1024; - } mutable_db_options_ = new_options; - env_options_for_compaction_ = EnvOptions( - BuildDBOptions(immutable_db_options_, mutable_db_options_)); + env_options_for_compaction_ = EnvOptions(new_db_options); env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite( env_options_for_compaction_, immutable_db_options_); versions_->ChangeEnvOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options env_options_for_compaction_ = env_->OptimizeForCompactionTableRead( env_options_for_compaction_, immutable_db_options_); env_options_for_compaction_.compaction_readahead_size = diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index f73e8665fb6..ab8cb11d9c9 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1501,6 +1501,13 @@ class DBImpl : public DB { Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, size_t preallocate_block_size, log::Writer** new_log); + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 5019221b5ca..2fc12746d7d 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -145,7 +145,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } namespace { - Status SanitizeOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { @@ -158,52 +157,23 @@ Status SanitizeOptionsByTable( } return Status::OK(); } +} // namespace -static Status ValidateOptions( +Status DBImpl::ValidateOptions( const DBOptions& db_options, const std::vector& column_families) { Status s; - for (auto& cfd : column_families) { - s = CheckCompressionSupported(cfd.options); - if (s.ok() && db_options.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cfd.options); - } - if (s.ok()) { - s = CheckCFPathsSupported(db_options, cfd.options); - } + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); if (!s.ok()) { return s; } - - if (cfd.options.ttl > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "TTL is only supported when files are always " - "kept open (set max_open_files = -1). 
"); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "TTL is only supported in Block-Based Table format. "); - } - } - - if (cfd.options.periodic_compaction_seconds > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "Periodic Compaction is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "Periodic Compaction is only supported in " - "Block-Based Table format. "); - } - } } + s = ValidateOptions(db_options); + return s; +} +Status DBImpl::ValidateOptions(const DBOptions& db_options) { if (db_options.db_paths.size() > 4) { return Status::NotSupported( "More than four DB paths are not supported yet. "); @@ -241,7 +211,7 @@ static Status ValidateOptions( return Status::OK(); } -} // namespace + Status DBImpl::NewDB() { VersionEdit new_db; new_db.SetLogNumber(0); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index a9c8d218235..bf33153284e 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -66,10 +66,10 @@ class DBOptionsTest : public DBTestBase { std::unordered_map GetRandomizedMutableCFOptionsMap( Random* rnd) { - Options options; + Options options = CurrentOptions(); options.env = env_; ImmutableDBOptions db_options(options); - test::RandomInitCFOptions(&options, rnd); + test::RandomInitCFOptions(&options, options, rnd); auto sanitized_options = SanitizeOptions(db_options, options); auto opt_map = GetMutableCFOptionsMap(sanitized_options); delete options.compaction_filter; diff --git a/db/db_test.cc b/db/db_test.cc index 4c4bd382ca8..27cf790ee57 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4884,11 +4884,14 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); + // Appveyor fails with: Compression type Snappy is not linked with the binary +#ifndef OS_WIN ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kSnappyCompression, mutable_cf_options.compression); +#endif // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); diff --git a/options/options_test.cc b/options/options_test.cc index 429b607e4f9..1aa3bace7dd 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -842,7 +842,7 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); - test::RandomInitCFOptions(&base_cf_opts, &rnd); + test::RandomInitCFOptions(&base_cf_opts, base_db_opts, &rnd); Options base_opts(base_db_opts, base_cf_opts); DBOptions new_db_opts(base_opts); @@ -854,11 +854,12 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { } TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { + Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); // Phase 1: randomly assign base_opt // custom type options - test::RandomInitCFOptions(&base_opt, &rnd); + test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; @@ -1521,7 +1522,7 @@ TEST_F(OptionsParserTest, DumpAndParse) { for (int c = 0; c < num_cf; ++c) { 
ColumnFamilyOptions cf_opt; Random cf_rnd(0xFB + c); - test::RandomInitCFOptions(&cf_opt, &cf_rnd); + test::RandomInitCFOptions(&cf_opt, base_db_opt, &cf_rnd); if (c < 4) { cf_opt.prefix_extractor.reset(test::RandomSliceTransform(&rnd, c)); } diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 18e1a45bb36..4e37cde40d1 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -162,7 +162,11 @@ std::string RandomName(Random* rnd, const size_t len) { } CompressionType RandomCompressionType(Random* rnd) { - return static_cast(rnd->Uniform(6)); + auto ret = static_cast(rnd->Uniform(6)); + while (!CompressionTypeSupported(ret)) { + ret = static_cast((static_cast(ret) + 1) % 6); + } + return ret; } void RandomCompressionTypeVector(const size_t count, @@ -293,7 +297,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->stats_dump_period_sec = rnd->Uniform(100000); } -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, + Random* rnd) { cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); // boolean options @@ -345,8 +350,10 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { // uint64_t options static const uint64_t uint_max = static_cast(UINT_MAX); - cf_opt->ttl = uint_max + rnd->Uniform(10000); - cf_opt->periodic_compaction_seconds = uint_max + rnd->Uniform(10000); + cf_opt->ttl = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; + cf_opt->periodic_compaction_seconds = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); cf_opt->max_compaction_bytes = diff --git a/test_util/testutil.h b/test_util/testutil.h index 7890ce5f511..bc0b2b07d5f 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -657,7 +657,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); // Randomly initialize the given ColumnFamilyOptions // Note that the caller is responsible for releasing non-null // cf_opt->compaction_filter. -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd); // A dummy merge operator which can change its name class ChanglingMergeOperator : public MergeOperator { diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 5b8015152ff..8c71dbf5dc3 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -58,7 +58,7 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { cf_names.push_back(i == 0 ? 
kDefaultColumnFamilyName : test::RandomName(&rnd_, 10)); cf_opts.emplace_back(); - test::RandomInitCFOptions(&cf_opts.back(), &rnd_); + test::RandomInitCFOptions(&cf_opts.back(), db_opt, &rnd_); } const std::string kFileName = "OPTIONS-123456"; @@ -82,7 +82,7 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { cf_opts[i].table_factory.get(), loaded_cf_descs[i].options.table_factory.get())); } - test::RandomInitCFOptions(&cf_opts[i], &rnd_); + test::RandomInitCFOptions(&cf_opts[i], db_opt, &rnd_); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( cf_opts[i], loaded_cf_descs[i].options)); } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index bf94d83d82b..e2a8fbbf20f 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -210,7 +210,7 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, WriteBatch empty_batch; write_options.disableWAL = true; write_options.sync = false; - const size_t ONE_BATCH = 1; // Just to inc the seq + const size_t ONE_BATCH = 1; // Just to inc the seq s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); From 5d6e8df1cf81213bed4c8fb27bf00bb09dc57e65 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 3 Jun 2019 22:37:40 -0700 Subject: [PATCH 104/572] Ignore shutdown error during compaction (#5400) Summary: The PR #5275 separated the column dropped and shutdown status codes. However, there were a couple of places in compaction where this change ended up treating a ShutdownInProgress() error as a real error and setting bg_error. This caused a MyRocks unit test to fail due to WAL writes during shutdown returning this error. Fix it by ignoring the shutdown status during compaction.
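For illustration, the intended classification can be summarized in a minimal sketch (a hypothetical helper, not the literal code of this patch; the real checks are inlined at the two call sites in the diff below):

#include "rocksdb/status.h"

// Sketch only: returns true iff a compaction status should be swallowed
// rather than recorded as a background error. A dropped column family and
// a shutdown in progress are both expected conditions during teardown.
bool IsBenignCompactionStatus(const rocksdb::Status& status) {
  return status.ok() || status.IsColumnFamilyDropped() ||
         status.IsShutdownInProgress();
}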
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5400 Differential Revision: D15611680 Pulled By: anand1976 fbshipit-source-id: c602e97840e3ae24eb420d61e0ce95d3e6258632 --- db/db_compaction_test.cc | 30 ++++++++++++++++++++++++++ db/db_impl/db_impl.h | 1 + db/db_impl/db_impl_compaction_flush.cc | 6 ++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 3051e89cd37..6537950fcc7 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -4557,6 +4557,36 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, CompactionDuringShutdown) { + Options opts = CurrentOptions(); + opts.level0_file_num_compaction_trigger = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + } + Flush(); + } + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + [&](void* /*arg*/) { + dbfull()->shutting_down_.store(true); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->error_handler_.GetBGError()); +} + // FixFileIngestionCompactionDeadlock tests and verifies that compaction and // file ingestion do not cause deadlock in the event of write stall triggered // by number of L0 files reaching level0_stop_writes_trigger. 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ab8cb11d9c9..111a91e04f3 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1000,6 +1000,7 @@ class DBImpl : public DB { friend class DBTest_ConcurrentFlushWAL_Test; friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 881fa26af37..7be9b62c5d6 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1049,7 +1049,7 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsColumnFamilyDropped()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, @@ -2680,6 +2680,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats, job_context->job_id); mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); compaction_job.Run(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -2713,7 +2715,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (status.ok() || status.IsCompactionTooLarge()) { // Done - } else if (status.IsColumnFamilyDropped()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", From c8267120d809551d1de99f518c1a7b453fad20c0 Mon Sep 17 00:00:00 2001 From: Mark Rambacher Date: Mon, 3 Jun 2019 22:59:54 -0700 Subject: [PATCH 105/572] Add support for loading dynamic libraries into the RocksDB environment (#5281) Summary: This change adds a Dynamic Library class to the RocksDB Env. Dynamic libraries are populated via the Env::LoadLibrary method. The addition of dynamic library support allows for a few different features to be developed: 1. The compression code can be changed to use dynamic library support. This would allow RocksDB to determine at run-time what compression packages were installed. This change would eliminate the need to make sure the build-time and run-time environments have the same library set. It would also simplify some of the Java build issues (where it attempts to build and include various packages inside the RocksDB jars). 2. Along with other features (to be provided in a subsequent PR), this change would allow code/configurations to be added to RocksDB at run-time. For example, the build system includes code for building an "rados" environment and adding "Cassandra" features. Instead of these extensions being built into the base RocksDB code, these extensions could be loaded at run-time as required/appropriate, either by configuration or explicitly. We intend to push out other changes in support of extending RocksDB at run-time via configurations.
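As a usage sketch (hypothetical library and symbol names, not part of this patch):

#include <functional>
#include <memory>

#include "rocksdb/env.h"

// Sketch only: "mylib" and "my_create_function" are hypothetical names.
void LoadLibraryExample() {
  std::shared_ptr<rocksdb::DynamicLibrary> library;
  // An empty search path falls back to the platform default mechanism
  // (e.g. LD_LIBRARY_PATH); a "lib" prefix and the platform's shared
  // library extension are added automatically when missing.
  rocksdb::Status s =
      rocksdb::Env::Default()->LoadLibrary("mylib", "", &library);
  if (s.ok()) {
    std::function<void*()> create_fn;
    // LoadFunction() resolves the symbol and wraps the raw function
    // pointer in the requested std::function type.
    s = library->LoadFunction("my_create_function", &create_fn);
  }
}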
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5281 Differential Revision: D15447613 Pulled By: riversand963 fbshipit-source-id: 452cd4f54511c0bceee18f6d9d919aae9fd25fef --- .gitignore | 1 + TARGETS | 1 + buckifier/targets_cfg.py | 1 + build_tools/build_detect_platform | 13 ++++ env/env_posix.cc | 99 +++++++++++++++++++++++++++++++ env/env_test.cc | 46 ++++++++++++++ include/rocksdb/env.h | 42 +++++++++++++ 7 files changed, 203 insertions(+) diff --git a/.gitignore b/.gitignore index e88ccfc008c..6364dfdc401 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +.DS_Store java/out java/target diff --git a/TARGETS b/TARGETS index a635ed5ac7d..0cdd3b162f9 100644 --- a/TARGETS +++ b/TARGETS @@ -30,6 +30,7 @@ ROCKSDB_COMPILER_FLAGS = [ "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 730b5ebf9da..79648bb6a6d 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -35,6 +35,7 @@ "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 7f454bcca08..5d42faa30ae 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -602,6 +602,19 @@ EOF fi fi +if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then + $CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null </dev/null + if [ "$?" = 0 ]; then + EXEC_LDFLAGS+="-ldl" + rm -f test_dl.o + fi + fi +fi + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" diff --git a/env/env_posix.cc b/env/env_posix.cc index 7eb5b7c1451..f1a0907c9fe 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -7,8 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors #include +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include +#endif #include #include + #if defined(OS_LINUX) #include #endif @@ -69,6 +73,17 @@ #endif namespace rocksdb { +#if defined(OS_WIN) +static const std::string kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif namespace { @@ -115,6 +130,32 @@ int cloexec_flags(int flags, const EnvOptions* options) { return flags; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) override { + char* err = dlerror(); // Clear any old error + *func = (FunctionPtr)dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } else { + err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + class PosixEnv : public Env { public: PosixEnv(); @@ -729,6 +770,64 @@ class PosixEnv : public Env { return result; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + /** + * Loads the named library into the result. + * If the input name is empty, the current executable is loaded + * On *nix systems, a "lib" prefix is added to the name if one is not supplied + * Comparably, the appropriate shared library extension is added to the name + * if not supplied. 
If search_path is not specified, the shared library will + * be loaded using the default path (LD_LIBRARY_PATH) If search_path is + * specified, the shared library will be searched for in the directories + * provided by the search path + */ + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + Status status; + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = nullptr) override; diff --git a/env/env_test.cc b/env/env_test.cc index e8cb9b24534..30d5b528217 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -247,6 +247,52 @@ TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { ASSERT_EQ(expected_data, actual_data); } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +TEST_F(EnvPosixTest, LoadRocksDBLibrary) { + std::shared_ptr library; + std::function function; + Status status = env_->LoadLibrary("no-such-library", "", &library); + ASSERT_NOK(status); + ASSERT_EQ(nullptr, library.get()); + status = env_->LoadLibrary("rocksdb", "", &library); + if (status.ok()) { // If we have can find a rocksdb shared library + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(library->LoadFunction("rocksdb_create_default_env", + &function)); // from C definition + ASSERT_NE(nullptr, function); + ASSERT_NOK(library->LoadFunction("no-such-method", &function)); + ASSERT_EQ(nullptr, function); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } else { + ASSERT_EQ(nullptr, library.get()); + } +} +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) +TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) { + std::shared_ptr library; + std::function function; + ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } + char buff[1024]; + std::string cwd = getcwd(buff, sizeof(buff)); + + status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library); + if (status.ok()) { + 
ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } +} +#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic called(false); env_->SetBackgroundThreads(1, Env::LOW); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 8f6bd607228..a8fe2fb78ea 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -41,6 +41,7 @@ namespace rocksdb { +class DynamicLibrary; class FileLock; class Logger; class RandomAccessFile; @@ -338,6 +339,18 @@ class Env { // REQUIRES: lock has not already been unlocked. virtual Status UnlockFile(FileLock* lock) = 0; + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ":") and looks for the + // library in those directories. If 'search_path' is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + // Priority for scheduling job in thread pool enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; @@ -978,6 +991,29 @@ class FileLock { void operator=(const FileLock&); }; +class DynamicLibrary { + public: + typedef void* (*FunctionPtr)(); + virtual ~DynamicLibrary() {} + + /** Returns the name of the dynamic library */ + virtual const char* Name() const = 0; + + /** + * Loads the symbol for sym_name from the library and updates the input + * function. Returns the status of the load. + */ + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + FunctionPtr ptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + /** Loads and returns the symbol for sym_name from the library */ + virtual Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) = 0; +}; + extern void LogFlush(const std::shared_ptr& info_log); extern void Log(const InfoLogLevel log_level, @@ -1168,6 +1204,12 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_->LoadLibrary(lib_name, search_path, result); + } + void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); From ebe89ef9d84cf1a05a47b8d03c7509f9f103ad10 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 4 Jun 2019 10:17:24 -0700 Subject: [PATCH 106/572] Fix merging range tombstone covering put during flush/compaction (#5406) Summary: Flush/compaction use `MergeUntil` which has a special code path to handle a merge ending with a non-`Merge` point key. In particular if that key is a `Put` we forgot to check whether it is covered by a range tombstone. If it is covered then we must not include it in the following call to `TimedFullMerge`. Fixes #5392.
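The failing sequence is small enough to state inline; this is condensed from the new PutDeleteRangeMergeFlush test added below (it assumes the same uint64-add merge operator the test installs):

std::string val;
PutFixed64(&val, 1);
ASSERT_OK(db_->Put(WriteOptions(), "key", val));        // (1) Put
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                           "key", "key_"));             // (2) covers "key"
ASSERT_OK(db_->Merge(WriteOptions(), "key", val));      // (3) Merge
ASSERT_OK(db_->Flush(FlushOptions()));                  // (4) Flush
// Correct Get("key") result: 1, the merge operand alone. Before this fix
// the covered Put leaked through MergeUntil and the result was 2.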
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5406 Differential Revision: D15611144 Pulled By: sagar0 fbshipit-source-id: ba6a7863ca2d043f591de78fd0c4f4561f0c500e --- HISTORY.md | 1 + db/db_range_del_test.cc | 24 ++++++++++++++++++++++++ db/merge_helper.cc | 10 +++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index b9b6998c6f5..b3c2ef14ac2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -24,6 +24,7 @@ ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. +* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 16d682fc083..e58095b2d92 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -491,6 +491,30 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { ASSERT_EQ(expected, actual); } +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + // NumTableFilesAtLevel() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 4a4d2fb714e..b5ae924ffc6 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -201,7 +201,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. const Slice val = iter->value(); - const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr; + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, From 227b5d52df103ef8722e537bd3ecd3445082b288 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 4 Jun 2019 10:51:22 -0700 Subject: [PATCH 107/572] Make RocksDB secondary instance respect atomic groups in version edits. (#5411) Summary: With this commit, RocksDB secondary instance respects atomic groups in version edits. 
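At its core the change factors the bookkeeping into an AtomicGroupReadBuffer shared by the primary's recovery path and the secondary's catch-up path: edits that belong to an atomic group are buffered until the advertised group size is reached, and only then applied as a unit. A self-contained toy model of that contract (simplified; the real class is in the diff below):

#include <cstddef>
#include <cstdint>
#include <vector>

// Toy model of AtomicGroupReadBuffer::AddEdit(): group edits are buffered;
// a size mismatch, or a normal edit arriving mid-group, signals corruption.
struct ToyEdit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // group members still expected after this
};

class ToyGroupBuffer {
 public:
  bool AddEdit(const ToyEdit& edit) {  // false == corrupted atomic group
    if (edit.in_atomic_group) {
      if (buffer_.empty()) {
        // The first edit of a group fixes the expected group size.
        buffer_.resize(edit.remaining_entries + 1);
      }
      ++read_;
      if (read_ + edit.remaining_entries != buffer_.size()) {
        return false;  // inconsistent group size
      }
      buffer_[read_ - 1] = edit;
      return true;
    }
    // A normal edit is only legal when no group is being accumulated.
    return buffer_.empty();
  }
  bool IsFull() const { return !buffer_.empty() && read_ == buffer_.size(); }
  void Clear() {
    read_ = 0;
    buffer_.clear();
  }

 private:
  std::size_t read_ = 0;
  std::vector<ToyEdit> buffer_;
};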
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5411 Differential Revision: D15617512 Pulled By: HaoyuHuang fbshipit-source-id: 913f4ede391d772dcaf5649e3cd2099fa292d120 --- db/db_impl/db_secondary_test.cc | 2 +- db/version_edit.h | 1 + db/version_set.cc | 403 ++++++++++++++---------- db/version_set.h | 47 ++- db/version_set_test.cc | 542 +++++++++++++++++++++----------- 5 files changed, 647 insertions(+), 348 deletions(-) diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index e8eafd673ed..5b375422f02 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -373,7 +373,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", [&](void* arg) { Status s = *reinterpret_cast(arg); if (s.IsPathNotFound()) { diff --git a/db/version_edit.h b/db/version_edit.h index 471b4e095ab..e1857b37fc4 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -316,6 +316,7 @@ class VersionEdit { friend class ReactiveVersionSet; friend class VersionSet; friend class Version; + friend class AtomicGroupReadBuffer; bool GetLevel(Slice* input, int* level, const char** msg); diff --git a/db/version_set.cc b/db/version_set.cc index 26465a01a4e..a60a4e87cac 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3313,6 +3313,51 @@ struct VersionSet::ManifestWriter { edit_list(e) {} }; +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = std::move(*edit); + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. 
+ if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, @@ -4071,6 +4116,74 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, return Status::OK(); } +Status VersionSet::ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map>& + builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + assert(reader != nullptr); + assert(read_buffer != nullptr); + Status s; + Slice record; + std::string scratch; + size_t recovered_edits = 0; + while (reader->ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + s = read_buffer->AddEdit(&edit); + if (!s.ok()) { + break; + } + if (edit.is_in_atomic_group_) { + if (read_buffer->IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer->replay_buffer()) { + s = ApplyOneVersionEditToBuilder( + e, name_to_options, column_families_not_found, builders, + have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, + have_last_sequence, last_sequence, min_log_number_to_keep, + max_column_family); + if (!s.ok()) { + break; + } + recovered_edits++; + } + if (!s.ok()) { + break; + } + read_buffer->Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder( + edit, name_to_options, column_families_not_found, builders, + have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, have_last_sequence, + last_sequence, min_log_number_to_keep, max_column_family); + if (s.ok()) { + recovered_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. 
+ read_buffer->Clear(); + } + TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", + &recovered_edits); + return s; +} + Status VersionSet::Recover( const std::vector& column_families, bool read_only) { @@ -4148,66 +4261,12 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; - std::vector replay_buffer; - size_t num_entries_decoded = 0; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - if (edit.is_in_atomic_group_) { - if (replay_buffer.empty()) { - replay_buffer.resize(edit.remaining_entries_ + 1); - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:FirstInAtomicGroup", - &edit); - } - ++num_entries_decoded; - if (num_entries_decoded + edit.remaining_entries_ != - static_cast(replay_buffer.size())) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:IncorrectAtomicGroupSize", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - replay_buffer[num_entries_decoded - 1] = std::move(edit); - if (num_entries_decoded == replay_buffer.size()) { - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", - &edit); - for (auto& e : replay_buffer) { - s = ApplyOneVersionEditToBuilder( - e, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - if (!s.ok()) { - break; - } - } - replay_buffer.clear(); - num_entries_decoded = 0; - } - TEST_SYNC_POINT("VersionSet::Recover:AtomicGroup"); - } else { - if (!replay_buffer.empty()) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } - if (!s.ok()) { - break; - } - } + AtomicGroupReadBuffer read_buffer; + s = ReadAndRecover( + &reader, &read_buffer, cf_name_to_options, column_families_not_found, + builders, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, &have_last_sequence, + &last_sequence, &min_log_number_to_keep, &max_column_family); } if (s.ok()) { @@ -5218,19 +5277,11 @@ Status ReactiveVersionSet::Recover( assert(reader != nullptr); Slice record; std::string scratch; - while (s.ok() && reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } + s = ReadAndRecover( + reader, &read_buffer_, cf_name_to_options, column_families_not_found, + builders, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, &have_last_sequence, + &last_sequence, &min_log_number_to_keep, &max_column_family); if (s.ok()) { bool enough = have_next_file && have_log_number && have_last_sequence; if (enough) { @@ -5350,7 +5401,7 
@@ Status ReactiveVersionSet::ReadAndApply( uint64_t previous_log_number = 0; uint32_t max_column_family = 0; uint64_t min_log_number_to_keep = 0; - + uint64_t applied_edits = 0; while (s.ok()) { Slice record; std::string scratch; @@ -5362,73 +5413,46 @@ Status ReactiveVersionSet::ReadAndApply( if (!s.ok()) { break; } - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. Ignore it for now. - if (nullptr == cfd) { - continue; - } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - s = ApplyOneVersionEditToBuilder( - edit, &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); + + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyOneVersionEditToBuilder( + e, cfds_changed, &have_log_number, &log_number, + &have_prev_log_number, &previous_log_number, &have_next_file, + &next_file, &have_last_sequence, &last_sequence, + &min_log_number_to_keep, &max_column_family); + if (!s.ok()) { + break; + } + applied_edits++; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + // Apply a normal edit immediately. 
+ s = ApplyOneVersionEditToBuilder( + edit, cfds_changed, &have_log_number, &log_number, + &have_prev_log_number, &previous_log_number, &have_next_file, + &next_file, &have_last_sequence, &last_sequence, + &min_log_number_to_keep, &max_column_family); + if (s.ok()) { + applied_edits++; } } - if (have_next_file) { - next_file_number_.store(next_file + 1); - } - if (have_last_sequence) { - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - } - if (have_prev_log_number) { - prev_log_number_ = previous_log_number; - MarkFileNumberUsed(previous_log_number); - } - if (have_log_number) { - MarkFileNumberUsed(log_number); - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer_.Clear(); } // It's possible that: // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. @@ -5457,52 +5481,113 @@ Status ReactiveVersionSet::ReadAndApply( } } } + TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", + &applied_edits); return s; } Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { - ColumnFamilyData* cfd = nullptr; - Status status; + VersionEdit& edit, std::unordered_set* cfds_changed, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. It is also possible that the secondary instance opens only a subset + // of column families. Ignore it for now. + if (nullptr == cfd) { + return Status::OK(); + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + if (edit.is_column_family_add_) { // TODO (yanqin) for now the secondary ignores column families created // after Open. This also simplifies handling of switching to a new MANIFEST // and processing the snapshot of the system at the beginning of the // MANIFEST. - return Status::OK(); } else if (edit.is_column_family_drop_) { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created by primary after secondary starts? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } // Drop the column family by setting it to be 'dropped' without destroying // the column family handle. + // TODO (haoyu) figure out how to handle column faimly drop for + // secondary instance. 
(Is it possible that the ref count for cfd is 0 but + // the ref count for its versions is higher than 0?) cfd->SetDropped(); if (cfd->Unref()) { delete cfd; cfd = nullptr; } } else { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Operation on a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); builder->Apply(&edit); } - return ExtractInfoFromVersionEdit( + Status s = ExtractInfoFromVersionEdit( cfd, edit, have_log_number, log_number, have_prev_log_number, previous_log_number, have_next_file, next_file, have_last_sequence, last_sequence, min_log_number_to_keep, max_column_family); + if (!s.ok()) { + return s; + } + + if (cfd != nullptr) { + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" + "AfterLoadTableHandlers", + &s); + + if (s.ok()) { + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } + // Some other error has occurred during LoadTableHandlers. + } + + if (have_next_file) { + next_file_number_.store(*next_file + 1); + } + if (have_last_sequence) { + last_allocated_sequence_ = *last_sequence; + last_published_sequence_ = *last_sequence; + last_sequence_ = *last_sequence; + } + if (have_prev_log_number) { + prev_log_number_ = *previous_log_number; + MarkFileNumberUsed(*previous_log_number); + } + if (have_log_number) { + MarkFileNumberUsed(*log_number); + } + column_family_set_->UpdateMaxColumnFamily(*max_column_family); + MarkMinLogNumberToKeep2PC(*min_log_number_to_keep); + return s; } Status ReactiveVersionSet::MaybeSwitchManifest( diff --git a/db/version_set.h b/db/version_set.h index c43e4091442..dc9e759655e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -752,6 +752,23 @@ struct ObsoleteFileInfo { class BaseReferencedVersionBuilder; +class AtomicGroupReadBuffer { + public: + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + // VersionSet is the collection of versions of all the column families of the // database. Each database owns one VersionSet. A VersionSet has access to all // column families via ColumnFamilySet, i.e. set of the column families. 
@@ -1028,6 +1045,18 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& + name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, @@ -1135,16 +1164,23 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); + uint64_t TEST_read_edits_in_atomic_group() const { + return read_buffer_.TEST_read_edits_in_atomic_group(); + } + std::vector& replay_buffer() { + return read_buffer_.replay_buffer(); + } + protected: using VersionSet::ApplyOneVersionEditToBuilder; // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family); + VersionEdit& edit, std::unordered_set* cfds_changed, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); Status MaybeSwitchManifest( log::Reader::Reporter* reporter, @@ -1153,6 +1189,7 @@ class ReactiveVersionSet : public VersionSet { private: std::unordered_map> active_version_builders_; + AtomicGroupReadBuffer read_buffer_; using VersionSet::LogAndApply; using VersionSet::Recover; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 77890d82638..bf9ef8e39fe 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -607,6 +607,7 @@ class VersionSetTestBase { const static std::string kColumnFamilyName1; const static std::string kColumnFamilyName2; const static std::string kColumnFamilyName3; + int num_initial_edits_; VersionSetTestBase() : env_(Env::Default()), @@ -618,6 +619,9 @@ class VersionSetTestBase { versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_)), + reactive_versions_(std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(std::make_shared()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -653,7 +657,7 @@ class VersionSetTestBase { new_cfs.emplace_back(new_cf); } *last_seqno = last_seq; - + num_initial_edits_ = static_cast(new_cfs.size() + 1); const std::string manifest = DescriptorFileName(dbname_, 1); std::unique_ptr file; Status s = env_->NewWritableFile( @@ -708,6 +712,7 @@ class VersionSetTestBase { WriteController write_controller_; WriteBufferManager write_buffer_manager_; std::shared_ptr versions_; + std::shared_ptr reactive_versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; @@ -758,216 +763,388 @@ TEST_F(VersionSetTest, 
SameColumnFamilyGroupCommit) { EXPECT_EQ(kGroupSize - 1, count); } -TEST_F(VersionSetTest, HandleValidAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +class VersionSetAtomicGroupTest : public VersionSetTestBase, + public testing::Test { + public: + VersionSetAtomicGroupTest() : VersionSetTestBase() {} - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 3; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void SetUp() override { + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + SetupTestSyncPoints(); } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); + void SetupValidAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; + void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.back().DebugString(), - e->DebugString()); // compare based on value - EXPECT_TRUE(first_in_atomic_group); - last_in_atomic_group = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); + void SetupCorruptedAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != ((size_t)atomic_group_size / 2)) { + edits_[i].MarkAtomicGroup(--remaining); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_TRUE(last_in_atomic_group); -} + void SetupIncorrectAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); 
++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != 1) { + edits_[i].MarkAtomicGroup(--remaining); + } else { + edits_[i].MarkAtomicGroup(remaining--); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } -TEST_F(VersionSetTest, HandleIncompleteTrailingAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); + void SetupTestSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.back().DebugString(), + e->DebugString()); // compare based on value + EXPECT_TRUE(first_in_atomic_group_); + last_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AppliedEdits", + [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroup", + [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", + [&](void* arg) { + corrupted_edit_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", + [&](void* arg) { + edit_with_incorrect_group_size_ = + *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->EnableProcessing(); + } - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 4; - const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; - std::vector edits(kNumberOfPersistedVersionEdits); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void AddNewEditsToLog(int num_edits) { + for (int i = 0; i < num_edits; i++) { + std::string record; + edits_[i].EncodeTo(&record); + ASSERT_OK(log_writer_->AddRecord(record)); + } } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + + void TearDown() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + log_writer_.reset(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + protected: + std::vector column_families_; + SequenceNumber last_seqno_; + std::vector edits_; + bool first_in_atomic_group_ = false; + bool last_in_atomic_group_ = false; + int num_edits_in_atomic_group_ = 0; + int num_recovered_edits_ = 0; + int num_applied_edits_ = 0; + VersionEdit corrupted_edit_; + VersionEdit edit_with_incorrect_group_size_; + std::unique_ptr log_writer_; +}; - 
SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; - size_t num = 0; +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", - [&](void* /* arg */) { last_in_atomic_group = true; }); - SyncPoint::GetInstance()->SetCallBack("VersionSet::Recover:AtomicGroup", - [&](void* /* arg */) { ++num; }); - SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_FALSE(last_in_atomic_group); - EXPECT_EQ(kNumberOfPersistedVersionEdits, num); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); } -TEST_F(VersionSetTest, HandleCorruptedAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); - - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != (kAtomicGroupSize / 2)) { - edits[i].MarkAtomicGroup(--remaining); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + // Write the last record. The reactive version set should now apply all + // edits. + std::string last_record; + edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); + EXPECT_OK(log_writer_->AddRecord(last_record)); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + // Reactive version set should be empty now. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + // No edits in an atomic group. + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + // Write a few edits in an atomic group. + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool mixed = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[kAtomicGroupSize / 2].DebugString(), e->DebugString()); - mixed = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(mixed); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); } -TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - // Append multiple 
version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != 1) { - edits[i].MarkAtomicGroup(--remaining); - } else { - edits[i].MarkAtomicGroup(remaining--); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); + SetupCorruptedAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + // Write the corrupted edits. + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - bool incorrect_group_size = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:IncorrectAtomicGroupSize", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[1].DebugString(), e->DebugString()); - incorrect_group_size = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(incorrect_group_size); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + 
&manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); } class VersionSetTestDropOneCF : public VersionSetTestBase, @@ -1088,7 +1265,6 @@ INSTANTIATE_TEST_CASE_P( testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); - } // namespace rocksdb int main(int argc, char** argv) { From 0153e14569c30f225d7a08050acbf10c4d211d41 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 5 Jun 2019 09:38:23 -0700 Subject: [PATCH 108/572] Add a MultiRead() method to Env (#5311) Summary: Define the Env:: MultiRead() method to allow callers to request multiple block reads in one shot. The underlying Env implementation can parallelize it if it chooses to in order to reduce the overall IO latency. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5311 Differential Revision: D15502172 Pulled By: anand1976 fbshipit-source-id: 2b228269c2e11b5f54694d6b2bb3119c8a8ce2b9 --- env/env_test.cc | 53 +++++++++++++++++++++++++++++++++++++++++++ include/rocksdb/env.h | 39 +++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/env/env_test.cc b/env/env_test.cc index 30d5b528217..a2b6db5c475 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -1105,6 +1105,59 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { } } +TEST_P(EnvPosixTestWithParam, MultiRead) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // Create file. + { + std::unique_ptr wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice)); + } + ASSERT_OK(wfile->Close()); + } + + // Random Read + { + std::unique_ptr file; + std::vector reqs(3); + std::vector> data; + uint64_t offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offset; + offset += 2 * kSectorSize; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + for (size_t i = 0; i < reqs.size(); ++i) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i*2 + 1)); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); + } + } +} + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index a8fe2fb78ea..0a055cea0bf 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -583,6 +583,26 @@ class SequentialFile { // SequentialFileWrapper too. 
};

+// A read IO request structure for use in MultiRead.
+struct ReadRequest {
+  // File offset in bytes
+  uint64_t offset;
+
+  // Length to read in bytes
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer.
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes.
+  Slice result;
+
+  // Status of the read
+  Status status;
+};
+
 // A file abstraction for randomly reading the contents of a file.
 class RandomAccessFile {
  public:
@@ -607,6 +627,22 @@ class RandomAccessFile {
     return Status::OK();
   }

+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e. it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the function's return status is not ok, the status
+  // of individual requests will be ignored and the return status will be
+  // assumed for all read requests. The function's return status is only
+  // meant for errors that occur before processing the individual read
+  // requests.
+  virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest& req = reqs[i];
+      req.status = Read(req.offset, req.len, &req.result, req.scratch);
+    }
+    return Status::OK();
+  }
+
   // Tries to get a unique ID for this file that will be the same each time
   // the file is opened (and will stay the same while the file is open).
   // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
@@ -1357,6 +1393,9 @@ class RandomAccessFileWrapper : public RandomAccessFile {
               char* scratch) const override {
     return target_->Read(offset, n, result, scratch);
   }
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+    return target_->MultiRead(reqs, num_reqs);
+  }
   Status Prefetch(uint64_t offset, size_t n) override {
     return target_->Prefetch(offset, n);
   }

From 267b9b109176f51e59604233d6bef5293278f2a1 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 13:56:46 -0700
Subject: [PATCH 109/572] Disable dynamic extension support by default for
 CMake (#5419)

Summary:
We have users reporting linking errors while building RocksDB using CMake,
and they do not need the dynamic extension feature. The fix is to add
`-DROCKSDB_NO_DYNAMIC_EXTENSION` to CMake by default.
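
For context, a minimal sketch (not part of this patch) of the kind of code the
macro compiles out; the guard mirrors the `#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION`
block in env_posix.cc, and the plugin name "my_plugin" is hypothetical:
```
#include <memory>

#include "rocksdb/env.h"

// Sketch only: when ROCKSDB_NO_DYNAMIC_EXTENSION is defined (now the CMake
// default), Env::LoadLibrary is compiled out, so callers must guard their
// use of it the same way.
rocksdb::Status TryLoadPlugin(rocksdb::Env* env) {
#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
  std::shared_ptr<rocksdb::DynamicLibrary> lib;
  // An empty search path falls back to the default (LD_LIBRARY_PATH).
  return env->LoadLibrary("my_plugin" /* hypothetical */, "", &lib);
#else
  (void)env;
  return rocksdb::Status::NotSupported("built without dynamic extension");
#endif
}
```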
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5419

Differential Revision: D15676792

Pulled By: riversand963

fbshipit-source-id: d45aaacfc64ea61646fd7329c352cd760145baf3
---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cb4cc7a863..354697b05bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -332,6 +332,10 @@ if(DISABLE_STALL_NOTIF)
   add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION)
 endif()

+option(WITH_DYNAMIC_EXTENSION "build with dynamic extension support" OFF)
+if(NOT WITH_DYNAMIC_EXTENSION)
+  add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION)
+endif()

 if(DEFINED USE_RTTI)
   if(USE_RTTI)
@@ -488,7 +492,7 @@ set(SOURCES
   db/compacted_db_impl.cc
   db/compaction/compaction.cc
   db/compaction/compaction_iterator.cc
-  db/compaction/compaction_picker.cc
+  db/compaction/compaction_picker.cc
   db/compaction/compaction_job.cc
   db/compaction/compaction_picker_fifo.cc
   db/compaction/compaction_picker_level.cc

From cb1bf09bfc912472b380d09ee2b733a6684457d7 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 15:16:43 -0700
Subject: [PATCH 110/572] Fix tsan error (#5414)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The previous code produced a warning when compiled with tsan, leading to an
error since we build with -Werror. Compilation result:
```
In file included from ./env/env_chroot.h:12,
                 from env/env_test.cc:40:
./include/rocksdb/env.h: In instantiation of ‘rocksdb::Status rocksdb::DynamicLibrary::LoadFunction(const string&, std::function<T>*) [with T = void*(void*, const char*); std::__cxx11::string = std::__cxx11::basic_string<char>]’:
env/env_test.cc:260:5:   required from here
./include/rocksdb/env.h:1010:17: error: cast between incompatible function types from ‘rocksdb::DynamicLibrary::FunctionPtr’ {aka ‘void* (*)()’} to ‘void* (*)(void*, const char*)’ [-Werror=cast-function-type]
    *function = reinterpret_cast<T*>(ptr);
                ^~~~~~~~~~~~~~~~~~~~~~~~~
cc1plus: all warnings being treated as errors
make: *** [env/env_test.o] Error 1
```
The clang analyzer also reported a warning:
```
env/env_posix.cc:141:11: warning: Value stored to 'err' during its initialization is never read
    char* err = dlerror();  // Clear any old error
          ^~~   ~~~~~~~~~
1 warning generated.
```

Test plan (on my devserver).
```
$make clean
$OPT=-g ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1 make -j32
$
$make clean
$USE_CLANG=1 TEST_TMPDIR=/dev/shm/rocksdb OPT=-g make -j1 analyze
```
Both should pass.
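
As an illustration, a hedged usage sketch (not part of the patch) of the
revised `LoadFunction` template after this fix; the library handle and the
symbol name "my_symbol" are assumptions:
```
#include <functional>

#include "rocksdb/env.h"

// Sketch: instantiate LoadFunction with T = void*(void*, const char*), the
// same instantiation that triggered the tsan build error above. LoadSymbol
// now hands back a plain void*, and LoadFunction reinterpret_casts it to T*,
// so no function-pointer-to-function-pointer cast is emitted.
rocksdb::Status LoadLookupFn(
    rocksdb::DynamicLibrary* lib,
    std::function<void*(void*, const char*)>* fn) {
  return lib->LoadFunction("my_symbol" /* hypothetical */, fn);
}
```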
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5414

Differential Revision: D15637315

Pulled By: riversand963

fbshipit-source-id: 8e307483761019a4d5998cab92d49516d7edffbf
---
 env/env_posix.cc      | 27 +++++++++++++--------------
 include/rocksdb/env.h | 16 +++++++---------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/env/env_posix.cc b/env/env_posix.cc
index f1a0907c9fe..c0edb00968e 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -137,13 +137,14 @@ class PosixDynamicLibrary : public DynamicLibrary {
       : name_(name), handle_(handle) {}
   ~PosixDynamicLibrary() override { dlclose(handle_); }

-  Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) override {
-    char* err = dlerror();  // Clear any old error
-    *func = (FunctionPtr)dlsym(handle_, sym_name.c_str());
+  Status LoadSymbol(const std::string& sym_name, void** func) override {
+    assert(nullptr != func);
+    dlerror();  // Clear any old error
+    *func = dlsym(handle_, sym_name.c_str());
     if (*func != nullptr) {
       return Status::OK();
     } else {
-      err = dlerror();
+      char* err = dlerror();
       return Status::NotFound("Error finding symbol: " + sym_name, err);
     }
   }
@@ -771,16 +772,14 @@ class PosixEnv : public Env {
   }

 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
-  /**
-   * Loads the named library into the result.
-   * If the input name is empty, the current executable is loaded
-   * On *nix systems, a "lib" prefix is added to the name if one is not supplied
-   * Comparably, the appropriate shared library extension is added to the name
-   * if not supplied. If search_path is not specified, the shared library will
-   * be loaded using the default path (LD_LIBRARY_PATH) If search_path is
-   * specified, the shared library will be searched for in the directories
-   * provided by the search path
-   */
+  // Loads the named library into the result.
+  // If the input name is empty, the current executable is loaded.
+  // On *nix systems, a "lib" prefix is added to the name if one is not
+  // supplied. Similarly, the appropriate shared library extension is added
+  // to the name if not supplied. If search_path is not specified, the shared
+  // library will be loaded using the default path (LD_LIBRARY_PATH). If
+  // search_path is specified, the shared library will be searched for in the
+  // directories provided by the search path.
   Status LoadLibrary(const std::string& name, const std::string& path,
                      std::shared_ptr<DynamicLibrary>* result) override {
     Status status;
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 0a055cea0bf..ba8978dc810 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -1029,25 +1029,23 @@ class FileLock {

 class DynamicLibrary {
  public:
-  typedef void* (*FunctionPtr)();
   virtual ~DynamicLibrary() {}

-  /** Returns the name of the dynamic library */
+  // Returns the name of the dynamic library.
   virtual const char* Name() const = 0;

-  /**
-   * Loads the symbol for sym_name from the library and updates the input
-   * function. Returns the loaded symbol
-   */
+  // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the loaded symbol.
   template <typename T>
   Status LoadFunction(const std::string& sym_name,
                       std::function<T>* function) {
-    FunctionPtr ptr;
+    assert(nullptr != function);
+    void* ptr = nullptr;
     Status s = LoadSymbol(sym_name, &ptr);
     *function = reinterpret_cast<T*>(ptr);
     return s;
   }
-  /** Loads and returns the symbol for sym_name from the library */
-  virtual Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) = 0;
+  // Loads and returns the symbol for sym_name from the library.
+  virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
 };

 extern void LogFlush(const std::shared_ptr<Logger>& info_log);

From 340ed4fac751025dcf4368affabf950b3a417a05 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 23:07:28 -0700
Subject: [PATCH 111/572] Add support for timestamp in Get/Put (#5079)

Summary:
It's useful to be able to (optionally) associate key-value pairs with
user-provided timestamps. This PR is an early effort towards this goal and
continues the work of facebook#4942. A suite of new unit tests exists in
DBBasicTestWithTimestampWithParam.

Support for timestamps requires the user to provide a timestamp as a slice in
`ReadOptions` and `WriteOptions`. All timestamps in the same database must
share the same length and format, and the user is responsible for providing a
comparator function (Comparator) to order the tuples. Once created, the format
and length of the timestamp cannot change (at least for now).

Test plan (on devserver):
```
$COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
All tests must pass.

We also run the following db_bench tests to verify whether there is a
regression on Get/Put while timestamps are not enabled.
```
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000
```
Repeat 6 times for both versions. Results are as follows:
```
|        | readrandom | fillrandom |
| master | 16.77 MB/s | 47.05 MB/s |
| PR5079 | 16.44 MB/s | 47.03 MB/s |
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5079

Differential Revision: D15132946

Pulled By: riversand963

fbshipit-source-id: 833a0d657eac21182f0f206c910a6438154c742c
---
 HISTORY.md                                     |   1 +
 db/db_basic_test.cc                            | 151 ++++++++++++++++++
 db/db_impl/db_impl.cc                          |  11 +-
 db/db_impl/db_impl_write.cc                    |  24 ++-
 db/dbformat.h                                  |  27 ++++
 db/memtable.cc                                 |  16 +-
 db/version_set.cc                              |  43 ++---
 include/rocksdb/comparator.h                   |  27 ++++
 include/rocksdb/options.h                      |  22 ++-
 options/options.cc                             |   6 +-
 .../block_based/block_based_table_builder.cc   |   3 +-
 table/block_based/block_based_table_reader.cc  |  17 +-
 table/get_context.cc                           |   2 +-
 util/comparator.cc                             |   8 +
 14 files changed, 318 insertions(+), 40 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b3c2ef14ac2..028ddcf8253 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 * Partitions of partitioned indexes no longer affect the read amplification statistics.
 * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
 * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set.
+* Add initial support for Get/Put with user timestamps.
Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 45524b250f7..1aec864dd6f 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -1284,6 +1284,157 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } } + +class DBBasicTestWithTimestampWithParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBTestBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return cmp_without_ts_->Compare(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); 
+ CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index ba76abc2875..96b911a6d37 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1376,7 +1376,16 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + if (nullptr == read_options.timestamp) { + return GetImpl(read_options, column_family, key, value); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf); + if (s.ok()) { + s = GetImpl(read_options, column_family, akey, value); + } + return s; } Status DBImpl::GetImpl(const ReadOptions& read_options, diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 02e23e26931..947194ace19 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1677,11 +1677,25 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { // can call if they wish Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. - WriteBatch batch(key.size() + value.size() + 24); - Status s = batch.Put(column_family, key, value); + if (nullptr == opt.timestamp) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf); + if (!s.ok()) { + return s; + } + WriteBatch batch(akey.size() + value.size() + 24); + s = batch.Put(column_family, akey, value); if (!s.ok()) { return s; } diff --git a/db/dbformat.h b/db/dbformat.h index dbf6ea6f3c9..c6ee5677c09 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -151,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - 8); } +inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, + size_t ts_sz) { + assert(internal_key.size() >= 8 + ts_sz); + return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz); +} + +inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data(), user_key.size() - ts_sz); +} + inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { assert(internal_key.size() >= 8); const size_t n = internal_key.size(); @@ -658,4 +669,20 @@ struct ParsedInternalKeyComparator { const InternalKeyComparator* cmp; }; +// TODO (yanqin): this causes extra memory allocation and copy. Should be +// addressed in the future. +inline Status AppendTimestamp(const Slice& key, const Slice& timestamp, + Slice* ret_key, std::string* ret_buf) { + assert(ret_key != nullptr); + assert(ret_buf != nullptr); + if (key.data() + key.size() == timestamp.data()) { + *ret_key = Slice(key.data(), key.size() + timestamp.size()); + } else { + ret_buf->assign(key.data(), key.size()); + ret_buf->append(timestamp.data(), timestamp.size()); + *ret_key = Slice(*ret_buf); + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc index 46acbbfa61a..fdd1a577ade 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -493,6 +493,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (!allow_concurrent) { // Extract prefix for insert with hint. if (insert_with_hint_prefix_extractor_ != nullptr && @@ -525,7 +527,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->Add(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key); + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); } // The first sequence number inserted into the memtable @@ -559,7 +561,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key); + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); } // atomically update first_seqno_ and earliest_seqno_. @@ -632,8 +634,10 @@ static bool SaveValue(void* arg, const char* entry) { // all entries with overly large sequence numbers. 
uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); - if (s->mem->GetInternalKeyComparator().user_comparator()->Equal( - Slice(key_ptr, key_length - 8), s->key->user_key())) { + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -767,11 +771,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key); + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); } else { assert(prefix_extractor_); may_contain = diff --git a/db/version_set.cc b/db/version_set.cc index a60a4e87cac..ed9a316ac72 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -93,7 +93,8 @@ Status OverlapWithIterator(const Comparator* ucmp, return Status::Corruption("DB have corrupted keys"); } - if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { *overlap = true; } } @@ -171,17 +172,16 @@ class FilePicker { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
- assert( - curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)); + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); } // Setup file search bound for the next level based on the @@ -799,14 +799,16 @@ static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); } static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); } bool SomeFileOverlapsRange( @@ -952,8 +954,9 @@ class LevelIterator final : public InternalIterator { bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } InternalIterator* NewFileIterator() { @@ -2774,11 +2777,12 @@ void VersionStorageInfo::GetOverlappingInputs( FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it iter++; } else if (end != nullptr && - user_cmp->Compare(file_start, user_end) > 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { // "f" is completely after specified range; skip it iter++; } else { @@ -2793,10 +2797,11 @@ void VersionStorageInfo::GetOverlappingInputs( iter = index.erase(iter); if (expand_range) { if (begin != nullptr && - user_cmp->Compare(file_start, user_begin) < 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { user_begin = file_start; } - if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) { + if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { user_end = file_limit; } } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 46279f9a693..9f262367d11 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -20,6 +20,19 @@ class Slice; // from multiple threads. 
class Comparator { public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + virtual ~Comparator() {} // Three-way comparison. Returns value: @@ -78,6 +91,20 @@ class Comparator { // The major use case is to determine if DataBlockHashIndex is compatible // with the customized comparator. virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index cc7119410a0..307582fe678 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1255,6 +1255,14 @@ struct ReadOptions { // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1307,12 +1315,24 @@ struct WriteOptions { // Default: false bool low_pri; + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. 
+  const Slice* timestamp;
+
   WriteOptions()
       : sync(false),
         disableWAL(false),
         ignore_missing_column_families(false),
         no_slowdown(false),
-        low_pri(false) {}
+        low_pri(false),
+        timestamp(nullptr) {}
 };

 // Options that control flush operations
diff --git a/options/options.cc b/options/options.cc
index a5037ee78d3..8977b58905f 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -600,7 +600,8 @@ ReadOptions::ReadOptions()
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}

 ReadOptions::ReadOptions(bool cksum, bool cache)
     : snapshot(nullptr),
@@ -618,6 +619,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}

 }  // namespace rocksdb
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 9769e394f87..cae93f7f26f 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -531,7 +531,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
   // Note: PartitionedFilterBlockBuilder requires key being added to filter
   // builder after being added to index builder.
   if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) {
-    r->filter_builder->Add(ExtractUserKey(key));
+    size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size();
+    r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
   }

   r->last_key.assign(key.data(), key.size());
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 2fdaf2afd2a..37bbc3b52b3 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2672,8 +2672,11 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
   const Slice* const const_ikey_ptr = &internal_key;
   bool may_match = true;
   if (filter->whole_key_filtering()) {
-    may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid,
-                                    no_io, const_ikey_ptr);
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
+    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
+    may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor,
+                                    kNotValid, no_io, const_ikey_ptr);
   } else if (!read_options.total_order_seek && prefix_extractor &&
              rep_->table_properties->prefix_extractor_name.compare(
                  prefix_extractor->Name()) == 0 &&
@@ -2755,6 +2758,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       iiter_unique_ptr.reset(iiter);
     }

+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
    bool matched = false;  // if such user key matched a key in SST
    bool done = false;
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
@@ -2762,8 +2767,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       bool not_exist_in_filter =
           filter != nullptr && filter->IsBlockBased() == true &&
-          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
-                               handle.offset(), no_io);
+          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+                               prefix_extractor, handle.offset(), no_io);

       if (not_exist_in_filter) {
         // Not found
@@ -2793,7 +2798,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         }
         bool may_exist = biter.SeekForGet(key);
-        if (!may_exist) {
+        // If user-specified timestamp is supported, we cannot end the search
+        // just because hash index lookup indicates the key+ts does not exist.
+        if (!may_exist && ts_sz == 0) {
           // HashSeek cannot find the key in this block and the iter is not
           // the end of the block, i.e. cannot be in the following blocks
           // either. In this case, the seek_key cannot be found, so we break
diff --git a/table/get_context.cc b/table/get_context.cc
index 24c9ba7d5b7..9be16b0627d 100644
--- a/table/get_context.cc
+++ b/table/get_context.cc
@@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
   assert(matched);
   assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
          merge_context_ != nullptr);
-  if (ucmp_->Equal(parsed_key.user_key, user_key_)) {
+  if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
     *matched = true;
     // If the value is not in the snapshot, skip it
     if (!CheckCallback(parsed_key.sequence)) {
diff --git a/util/comparator.cc b/util/comparator.cc
index eab17ebccf3..717ebb52353 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -124,6 +124,10 @@ class BytewiseComparatorImpl : public Comparator {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return a.compare(b);
+  }
 };

 class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
@@ -192,6 +196,10 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return -a.compare(b);
+  }
 };

 }  // namespace

From aa71718ac3f5c2ed41f44f2dd5aa51aac6c1583e Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 6 Jun 2019 11:21:11 -0700
Subject: [PATCH 112/572] Add block cache tracer. (#5410)

Summary:
This PR adds a helper class, block cache tracer, to read/write block cache
accesses. It uses the trace reader/writer to perform this task.
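
For a sense of the API, a hedged sketch (not part of this patch) that writes
one access record with the new classes; the trace file path and the field
values are illustrative only:
```
#include <memory>
#include <utility>

#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

// Sketch: record a single data-block cache miss triggered by a user Get.
rocksdb::Status TraceOneAccess(rocksdb::Env* env) {
  std::unique_ptr<rocksdb::TraceWriter> file_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      env, rocksdb::EnvOptions(), "/tmp/block_cache_trace" /* hypothetical */,
      &file_writer);
  if (!s.ok()) {
    return s;
  }
  rocksdb::BlockCacheTraceWriter writer(env, rocksdb::TraceOptions(),
                                        std::move(file_writer));
  s = writer.WriteHeader();  // writes the magic number + RocksDB version
  if (!s.ok()) {
    return s;
  }
  rocksdb::BlockCacheTraceRecord record;
  record.access_timestamp = env->NowMicros();
  record.block_key = "test-block-1";
  record.block_type = rocksdb::TraceType::kBlockTraceDataBlock;
  record.block_size = 1024;
  record.cf_id = 0;
  record.cf_name = "default";
  record.level = 1;
  record.sst_fd_number = 100;
  record.caller = rocksdb::BlockCacheLookupCaller::kUserGet;
  record.is_cache_hit = rocksdb::Boolean::kFalse;
  record.no_insert = rocksdb::Boolean::kFalse;
  // The referenced-key fields are only persisted for data blocks read on
  // behalf of Get/MultiGet (see ShouldTraceReferencedKey).
  record.referenced_key = "test-get-1";
  record.is_referenced_key_exist_in_block = rocksdb::Boolean::kTrue;
  record.num_keys_in_block = 1024;
  return writer.WriteBlockAccess(record);
}
```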
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5410 Differential Revision: D15612843 Pulled By: HaoyuHuang fbshipit-source-id: f30fd1e1524355ca87db5d533a5c086728b141ea --- CMakeLists.txt | 1 + Makefile | 4 + src.mk | 2 + trace_replay/block_cache_tracer.cc | 218 ++++++++++++++++++++++++ trace_replay/block_cache_tracer.h | 105 ++++++++++++ trace_replay/block_cache_tracer_test.cc | 167 ++++++++++++++++++ trace_replay/trace_replay.cc | 38 +++-- trace_replay/trace_replay.h | 15 ++ 8 files changed, 538 insertions(+), 12 deletions(-) create mode 100644 trace_replay/block_cache_tracer.cc create mode 100644 trace_replay/block_cache_tracer.h create mode 100644 trace_replay/block_cache_tracer_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 354697b05bb..cef1f85d797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -628,6 +628,7 @@ set(SOURCES tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc trace_replay/trace_replay.cc + trace_replay/block_cache_tracer.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc diff --git a/Makefile b/Makefile index 080e0713355..3ee85ad67d0 100644 --- a/Makefile +++ b/Makefile @@ -561,6 +561,7 @@ TESTS = \ range_del_aggregator_test \ sst_file_reader_test \ db_secondary_test \ + block_cache_tracer_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -1588,6 +1589,9 @@ sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/src.mk b/src.mk index c172d0b2c2d..6303997cd59 100644 --- a/src.mk +++ b/src.mk @@ -143,6 +143,7 @@ LIB_SOURCES = \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ trace_replay/trace_replay.cc \ + trace_replay/block_cache_tracer.cc \ util/bloom.cc \ util/build_version.cc \ util/coding.cc \ @@ -371,6 +372,7 @@ MAIN_SOURCES = \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ tools/trace_analyzer_test.cc \ + trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc new file mode 100644 index 00000000000..8d0119a6891 --- /dev/null +++ b/trace_replay/block_cache_tracer.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "trace_replay/block_cache_tracer.h" + +#include "db/db_impl/db_impl.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +namespace { +const unsigned int kCharSize = 1; +bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { + return (record.block_type == TraceType::kBlockTraceDataBlock) && + (record.caller == BlockCacheLookupCaller::kUserGet || + record.caller == BlockCacheLookupCaller::kUserMGet); +} +} // namespace + +BlockCacheTraceWriter::BlockCacheTraceWriter( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) {} + +bool BlockCacheTraceWriter::ShouldTrace( + const BlockCacheTraceRecord& record) const { + if (trace_options_.sampling_frequency == 0 || + trace_options_.sampling_frequency == 1) { + return true; + } + // We use spatial downsampling so that we have a complete access history for a + // block. + const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); + return hash % trace_options_.sampling_frequency == 0; +} + +Status BlockCacheTraceWriter::WriteBlockAccess( + const BlockCacheTraceRecord& record) { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + if (trace_file_size > trace_options_.max_trace_file_size || + !ShouldTrace(record)) { + return Status::OK(); + } + Trace trace; + trace.ts = record.access_timestamp; + trace.type = record.block_type; + PutLengthPrefixedSlice(&trace.payload, record.block_key); + PutFixed64(&trace.payload, record.block_size); + PutFixed32(&trace.payload, record.cf_id); + PutLengthPrefixedSlice(&trace.payload, record.cf_name); + PutFixed32(&trace.payload, record.level); + PutFixed32(&trace.payload, record.sst_fd_number); + trace.payload.push_back(record.caller); + trace.payload.push_back(record.is_cache_hit); + trace.payload.push_back(record.no_insert); + if (ShouldTraceReferencedKey(record)) { + PutLengthPrefixedSlice(&trace.payload, record.referenced_key); + PutFixed64(&trace.payload, record.num_keys_in_block); + trace.payload.push_back(record.is_referenced_key_exist_in_block); + } + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + return trace_writer_->Write(encoded_trace); +} + +Status BlockCacheTraceWriter::WriteHeader() { + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = TraceType::kTraceBegin; + PutLengthPrefixedSlice(&trace.payload, kTraceMagic); + PutFixed32(&trace.payload, kMajorVersion); + PutFixed32(&trace.payload, kMinorVersion); + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + return trace_writer_->Write(encoded_trace); +} + +BlockCacheTraceReader::BlockCacheTraceReader( + std::unique_ptr&& reader) + : trace_reader_(std::move(reader)) {} + +Status BlockCacheTraceReader::ReadHeader(BlockCacheTraceHeader* header) { + assert(header != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + header->start_time = trace.ts; + Slice enc_slice = Slice(trace.payload); + Slice magnic_number; + if (!GetLengthPrefixedSlice(&enc_slice, &magnic_number)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read the magic 
number."); + } + if (magnic_number.ToString() != kTraceMagic) { + return Status::Corruption( + "Corrupted header in the trace file: Magic number does not match."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb major " + "version number."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb minor " + "version number."); + } + // We should have retrieved all information in the header. + if (!enc_slice.empty()) { + return Status::Corruption( + "Corrupted header in the trace file: The length of header is too " + "long."); + } + return Status::OK(); +} + +Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { + assert(record); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + record->access_timestamp = trace.ts; + record->block_type = trace.type; + Slice enc_slice = Slice(trace.payload); + Slice block_key; + if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block key."); + } + record->block_key = block_key.ToString(); + if (!GetFixed64(&enc_slice, &record->block_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block size."); + } + if (!GetFixed32(&enc_slice, &record->cf_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family ID."); + } + Slice cf_name; + if (!GetLengthPrefixedSlice(&enc_slice, &cf_name)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family name."); + } + record->cf_name = cf_name.ToString(); + if (!GetFixed32(&enc_slice, &record->level)) { + return Status::Incomplete( + "Incomplete access record: Failed to read level."); + } + if (!GetFixed32(&enc_slice, &record->sst_fd_number)) { + return Status::Incomplete( + "Incomplete access record: Failed to read SST file number."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read caller."); + } + record->caller = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read is_cache_hit."); + } + record->is_cache_hit = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read no_insert."); + } + record->no_insert = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + + if (ShouldTraceReferencedKey(*record)) { + Slice referenced_key; + if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced key."); + } + record->referenced_key = referenced_key.ToString(); + if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the number of keys in the " + "block."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "is_referenced_key_exist_in_block."); + } + record->is_referenced_key_exist_in_block = + static_cast(enc_slice[0]); + } + return Status::OK(); +} + +} // 
namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h new file mode 100644 index 00000000000..7b3c82e2b7e --- /dev/null +++ b/trace_replay/block_cache_tracer.h @@ -0,0 +1,105 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/trace_reader_writer.h" +#include "trace_replay/trace_replay.h" + +namespace rocksdb { + +enum BlockCacheLookupCaller : char { + kUserGet = 1, + kUserMGet = 2, + kUserIterator = 3, + kPrefetch = 4, + kCompaction = 5, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; + +enum Boolean : char { kTrue = 1, kFalse = 0 }; + +struct BlockCacheTraceRecord { + // Required fields for all accesses. + uint64_t access_timestamp; + std::string block_key; + TraceType block_type; + uint64_t block_size; + uint32_t cf_id; + std::string cf_name; + uint32_t level; + uint32_t sst_fd_number; + BlockCacheLookupCaller caller; + Boolean is_cache_hit; + Boolean no_insert; + + // Required fields for data block and user Get/Multi-Get only. + std::string referenced_key; + uint64_t num_keys_in_block = 0; + Boolean is_referenced_key_exist_in_block = Boolean::kFalse; +}; + +struct BlockCacheTraceHeader { + uint64_t start_time; + uint32_t rocksdb_major_version; + uint32_t rocksdb_minor_version; +}; + +// BlockCacheTraceWriter captures all RocksDB block cache accesses using a +// user-provided TraceWriter. Every RocksDB operation is written as a single +// trace. Each trace will have a timestamp and type, followed by the trace +// payload. +class BlockCacheTraceWriter { + public: + BlockCacheTraceWriter(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + ~BlockCacheTraceWriter() = default; + // No copy and move. + BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete; + BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete; + BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete; + BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete; + + Status WriteBlockAccess(const BlockCacheTraceRecord& record); + + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number and RocksDB version. + Status WriteHeader(); + + private: + bool ShouldTrace(const BlockCacheTraceRecord& record) const; + + Env* env_; + TraceOptions trace_options_; + std::unique_ptr trace_writer_; + /*Mutex to protect trace_writer_ */ + InstrumentedMutex trace_writer_mutex_; +}; + +// BlockCacheTraceReader helps read the trace file generated by +// BlockCacheTraceWriter using a user provided TraceReader. +class BlockCacheTraceReader { + public: + BlockCacheTraceReader(std::unique_ptr&& reader); + ~BlockCacheTraceReader() = default; + // No copy and move. 
+ BlockCacheTraceReader(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader(BlockCacheTraceReader&&) = delete; + BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete; + + Status ReadHeader(BlockCacheTraceHeader* header); + + Status ReadAccess(BlockCacheTraceRecord* record); + + private: + std::unique_ptr trace_reader_; +}; + +} // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc new file mode 100644 index 00000000000..28052d9db8d --- /dev/null +++ b/trace_replay/block_cache_tracer_test.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "trace_replay/block_cache_tracer.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +const uint64_t kBlockSize = 1024; +const std::string kBlockKeyPrefix = "test-block-"; +const uint32_t kCFId = 0; +const uint32_t kLevel = 1; +const uint64_t kSSTFDNumber = 100; +const std::string kRefKeyPrefix = "test-get-"; +const uint64_t kNumKeysInBlock = 1024; +} // namespace + +class BlockCacheTracerTest : public testing::Test { + public: + BlockCacheTracerTest() { + test_path_ = test::PerThreadDBPath("block_cache_tracer_test"); + env_ = rocksdb::Env::Default(); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace"; + } + + ~BlockCacheTracerTest() override { + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + + BlockCacheLookupCaller GetCaller(uint32_t key_id) { + uint32_t n = key_id % 5; + switch (n) { + case 0: + return BlockCacheLookupCaller::kPrefetch; + case 1: + return BlockCacheLookupCaller::kCompaction; + case 2: + return BlockCacheLookupCaller::kUserGet; + case 3: + return BlockCacheLookupCaller::kUserMGet; + case 4: + return BlockCacheLookupCaller::kUserIterator; + } + assert(false); + } + + void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id, + TraceType block_type, uint32_t nblocks) { + assert(writer); + for (uint32_t i = 0; i < nblocks; i++) { + uint32_t key_id = from_key_id + i; + BlockCacheTraceRecord record; + record.block_type = block_type; + record.block_size = kBlockSize + key_id; + record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.access_timestamp = env_->NowMicros(); + record.cf_id = kCFId; + record.cf_name = kDefaultColumnFamilyName; + record.caller = GetCaller(key_id); + record.level = kLevel; + record.sst_fd_number = kSSTFDNumber + key_id; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + // Provide these fields for all block types. + // The writer should only write these fields for data blocks and the + // caller is either GET or MGET. 
+      record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
+      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.num_keys_in_block = kNumKeysInBlock;
+      ASSERT_OK(writer->WriteBlockAccess(record));
+    }
+  }
+
+  void VerifyAccess(BlockCacheTraceReader* reader, uint32_t from_key_id,
+                    TraceType block_type, uint32_t nblocks) {
+    assert(reader);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      ASSERT_OK(reader->ReadAccess(&record));
+      ASSERT_EQ(block_type, record.block_type);
+      ASSERT_EQ(kBlockSize + key_id, record.block_size);
+      ASSERT_EQ(kBlockKeyPrefix + std::to_string(key_id), record.block_key);
+      ASSERT_EQ(kCFId, record.cf_id);
+      ASSERT_EQ(kDefaultColumnFamilyName, record.cf_name);
+      ASSERT_EQ(GetCaller(key_id), record.caller);
+      ASSERT_EQ(kLevel, record.level);
+      ASSERT_EQ(kSSTFDNumber + key_id, record.sst_fd_number);
+      ASSERT_EQ(Boolean::kFalse, record.is_cache_hit);
+      ASSERT_EQ(Boolean::kFalse, record.no_insert);
+      if (block_type == TraceType::kBlockTraceDataBlock &&
+          (record.caller == BlockCacheLookupCaller::kUserGet ||
+           record.caller == BlockCacheLookupCaller::kUserMGet)) {
+        ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id),
+                  record.referenced_key);
+        ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block);
+        ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block);
+        continue;
+      }
+      ASSERT_EQ("", record.referenced_key);
+      ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block);
+      ASSERT_EQ(0, record.num_keys_in_block);
+    }
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+    ASSERT_OK(writer.WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock,
+                     10);
+    WriteBlockAccess(&writer, 10, TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(&writer, 20, TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(&writer, 30, TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(&writer, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, header.rocksdb_major_version);
+    ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version);
+    // Read blocks.
+    VerifyAccess(&reader, 0, TraceType::kBlockTraceUncompressionDictBlock, 10);
+    VerifyAccess(&reader, 10, TraceType::kBlockTraceDataBlock, 10);
+    VerifyAccess(&reader, 20, TraceType::kBlockTraceFilterBlock, 10);
+    VerifyAccess(&reader, 30, TraceType::kBlockTraceIndexBlock, 10);
+    VerifyAccess(&reader, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    // Reading one more record should report an error.
+    BlockCacheTraceRecord record;
+    ASSERT_NOK(reader.ReadAccess(&record));
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc
index f9448069b80..b444ab371d9 100644
--- a/trace_replay/trace_replay.cc
+++ b/trace_replay/trace_replay.cc
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-
 #include "db/db_impl/db_impl.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/write_batch.h"
@@ -32,6 +31,30 @@ void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) {
 }
 }  // namespace
 
+void TracerHelper::EncodeTrace(const Trace& trace, std::string* encoded_trace) {
+  assert(encoded_trace);
+  PutFixed64(encoded_trace, trace.ts);
+  encoded_trace->push_back(trace.type);
+  PutFixed32(encoded_trace, static_cast<uint32_t>(trace.payload.size()));
+  encoded_trace->append(trace.payload);
+}
+
+Status TracerHelper::DecodeTrace(const std::string& encoded_trace,
+                                 Trace* trace) {
+  assert(trace != nullptr);
+  Slice enc_slice = Slice(encoded_trace);
+  if (!GetFixed64(&enc_slice, &trace->ts)) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  if (enc_slice.size() < kTraceTypeSize + kTracePayloadLengthSize) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  trace->type = static_cast<TraceType>(enc_slice[0]);
+  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
+  trace->payload = enc_slice.ToString();
+  return Status::OK();
+}
+
 Tracer::Tracer(Env* env, const TraceOptions& trace_options,
                std::unique_ptr<TraceWriter>&& trace_writer)
     : env_(env),
@@ -139,10 +162,7 @@ Status Tracer::WriteFooter() {
 
 Status Tracer::WriteTrace(const Trace& trace) {
   std::string encoded_trace;
-  PutFixed64(&encoded_trace, trace.ts);
-  encoded_trace.push_back(trace.type);
-  PutFixed32(&encoded_trace, static_cast<uint32_t>(trace.payload.size()));
-  encoded_trace.append(trace.payload);
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
   return trace_writer_->Write(Slice(encoded_trace));
 }
@@ -302,13 +322,7 @@ Status Replayer::ReadTrace(Trace* trace) {
   if (!s.ok()) {
     return s;
   }
-
-  Slice enc_slice = Slice(encoded_trace);
-  GetFixed64(&enc_slice, &trace->ts);
-  trace->type = static_cast<TraceType>(enc_slice[0]);
-  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
-  trace->payload = enc_slice.ToString();
-  return s;
+  return TracerHelper::DecodeTrace(encoded_trace, trace);
 }
 
 }  // namespace rocksdb
diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h
index d4030c61518..d6956317096 100644
--- a/trace_replay/trace_replay.h
+++ b/trace_replay/trace_replay.h
@@ -40,6 +40,12 @@ enum TraceType : char {
   kTraceGet = 4,
   kTraceIteratorSeek = 5,
   kTraceIteratorSeekForPrev = 6,
+  // Block cache related types.
+  kBlockTraceIndexBlock = 7,
+  kBlockTraceFilterBlock = 8,
+  kBlockTraceDataBlock = 9,
+  kBlockTraceUncompressionDictBlock = 10,
+  kBlockTraceRangeDeletionBlock = 11,
   // All trace types should be added before kTraceMax
   kTraceMax,
 };
@@ -60,6 +66,15 @@ struct Trace {
   }
 };
 
+class TracerHelper {
+ public:
+  // Encode a trace object into the given string.
+  static void EncodeTrace(const Trace& trace, std::string* encoded_trace);
+
+  // Decode a string into the given trace object.
+  static Status DecodeTrace(const std::string& encoded_trace, Trace* trace);
+};
+
 // Tracer captures all RocksDB operations using a user-provided TraceWriter.
 // Every RocksDB operation is written as a single trace. Each trace will have a
 // timestamp and type, followed by the trace payload.
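A minimal end-to-end sketch of the block cache tracer API introduced above, modeled on the `MixedBlocks` test: the trace file path and the single hand-filled record are assumptions for illustration, only APIs added in this patch are used, and all `Status` checks are elided for brevity. Per `TracerHelper::EncodeTrace`, each record is serialized as a fixed64 timestamp, a one-byte trace type, a fixed32 payload size, and the payload itself.
```cpp
// Hypothetical standalone harness; not part of the patch series.
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

using namespace rocksdb;

int main() {
  Env* env = Env::Default();
  EnvOptions env_options;
  const std::string trace_path = "/tmp/block_cache_trace";  // assumed path

  {
    // Writer side. WriteHeader() records the start time and RocksDB version;
    // each WriteBlockAccess() then appends one encoded trace record.
    std::unique_ptr<TraceWriter> trace_writer;
    NewFileTraceWriter(env, env_options, trace_path, &trace_writer);
    BlockCacheTraceWriter writer(env, TraceOptions(), std::move(trace_writer));
    writer.WriteHeader();

    BlockCacheTraceRecord record;
    record.access_timestamp = env->NowMicros();
    record.block_type = TraceType::kBlockTraceDataBlock;
    record.block_size = 4096;
    record.block_key = "test-block-1";
    record.cf_id = 0;
    record.cf_name = "default";
    record.level = 1;
    record.sst_fd_number = 100;
    record.caller = BlockCacheLookupCaller::kUserGet;
    record.is_cache_hit = Boolean::kFalse;
    record.no_insert = Boolean::kFalse;
    // Only meaningful for data blocks accessed by Get/MultiGet:
    record.referenced_key = "test-get-1";
    record.is_referenced_key_exist_in_block = Boolean::kTrue;
    record.num_keys_in_block = 1024;
    writer.WriteBlockAccess(record);
  }

  {
    // Reader side: read the header back, then iterate accesses until a
    // non-OK status signals the end of the trace file.
    std::unique_ptr<TraceReader> trace_reader;
    NewFileTraceReader(env, env_options, trace_path, &trace_reader);
    BlockCacheTraceReader reader(std::move(trace_reader));
    BlockCacheTraceHeader header;
    reader.ReadHeader(&header);
    BlockCacheTraceRecord record;
    while (reader.ReadAccess(&record).ok()) {
      // Process one block cache access here.
    }
  }
  return 0;
}
```
Note that `BlockCacheTraceWriter` guards `trace_writer_` with `trace_writer_mutex_`, which suggests a single writer instance is meant to be shared by concurrent block cache lookups.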
From bee2f48a6607f641701e8971f7df3a711feaf64a Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 6 Jun 2019 11:28:54 -0700 Subject: [PATCH 113/572] Refactor the handling of cache related counters and statistics (#5408) Summary: The patch cleans up the handling of cache hit/miss/insertion related performance counters, get context counters, and statistics by eliminating some code duplication and factoring out the affected logic into separate methods. In addition, it makes the semantics of cache hit metrics more consistent by changing the code so that accessing a partition of partitioned indexes/filters through a pinned reference no longer counts as a cache hit. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5408 Differential Revision: D15610883 Pulled By: ltamasi fbshipit-source-id: ee749c18965077aca971d8f8bee8b24ed8fa76f1 --- HISTORY.md | 1 + table/block_based/block_based_table_reader.cc | 460 ++++++++++-------- table/block_based/block_based_table_reader.h | 58 ++- table/block_based/block_type.h | 24 + table/block_based/partitioned_filter_block.cc | 5 - 5 files changed, 308 insertions(+), 240 deletions(-) create mode 100644 table/block_based/block_type.h diff --git a/HISTORY.md b/HISTORY.md index 028ddcf8253..c88b436e40d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 37bbc3b52b3..0d7e3cf53a0 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -230,10 +230,10 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Rep* const rep = table->get_rep(); assert(rep != nullptr); - constexpr bool is_index = true; const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), - UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context); return s; } @@ -244,9 +244,7 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( assert(index_block != nullptr); if (!index_block_.IsEmpty()) { - *index_block = - CachableEntry(index_block_.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */); + index_block->SetUnownedValue(index_block_.GetValue()); return Status::OK(); } @@ -321,7 +319,6 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; - constexpr bool is_index = true; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
it = new BlockBasedTableIterator( @@ -330,7 +327,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { internal_comparator(), internal_comparator()->user_comparator(), nullptr, kNullStats, true, index_key_includes_seq(), index_value_is_full()), - false, true, /* prefix_extractor */ nullptr, is_index, + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, index_key_includes_seq(), index_value_is_full()); } @@ -399,12 +396,11 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { for (; biter.Valid(); biter.Next()) { handle = biter.value(); CachableEntry block; - const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, is_index, nullptr /* get_context */); + &block, BlockType::kIndex, nullptr /* get_context */); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -662,44 +658,188 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { std::unique_ptr prefix_index_; }; +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) 
{ + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, Statistics* statistics, + Cache* block_cache, const Slice& key, BlockType block_type, GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, statistics); + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(rep_->level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); 
- // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(rep_->level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } + UpdateCacheMissMetrics(block_type, get_context); } return cache_handle; @@ -1170,7 +1310,7 @@ Status BlockBasedTable::ReadRangeDelBlock( ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, nullptr /* input_iter */, - false /* is_index */, true /* key_includes_seq */, + BlockType::kRangeDeletion, true /* key_includes_seq */, true /* index_key_is_full */, nullptr /* get_context */, Status(), prefetch_buffer)); assert(iter != nullptr); @@ -1433,38 +1573,24 @@ Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, bool is_index, + const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; assert(block); assert(block->IsEmpty()); Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep_->ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache( - block_cache, block_cache_key, - is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss - : &get_context->get_context_stats_.num_cache_data_miss) - : nullptr, - get_context - ? (is_index ? 
&get_context->get_context_stats_.num_cache_index_hit - : &get_context->get_context_stats_.num_cache_data_hit) - : nullptr, - statistics, get_context); + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); if (cache_handle != nullptr) { - if (is_index) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - } - block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); @@ -1482,6 +1608,9 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(!compressed_block_cache_key.empty()); block_cache_compressed_handle = block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + // if we found in the compressed cache, then uncompress and insert into // uncompressed cache if (block_cache_compressed_handle == nullptr) { @@ -1508,7 +1637,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { std::unique_ptr block_holder( - new Block(std::move(contents), rep_->get_global_seqno(is_index), + new Block(std::move(contents), rep_->get_global_seqno(block_type), read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && @@ -1526,32 +1655,7 @@ Status BlockBasedTable::GetDataBlockFromCache( block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } + UpdateCacheInsertionMetrics(block_type, get_context, charge); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1571,15 +1675,19 @@ Status BlockBasedTable::PutDataBlockToCache( CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - MemoryAllocator* memory_allocator, bool is_index, + MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { const ImmutableCFOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; const Cache::Priority priority = - is_index && rep_->table_options - .cache_index_and_filter_blocks_with_high_priority + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) ? 
Cache::Priority::HIGH : Cache::Priority::LOW; assert(cached_block); @@ -1652,33 +1760,7 @@ Status BlockBasedTable::PutDataBlockToCache( cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - assert(reinterpret_cast(block_cache->Value( - cached_block->GetCacheHandle())) == cached_block->GetValue()); + UpdateCacheInsertionMetrics(block_type, get_context, charge); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1798,18 +1880,11 @@ CachableEntry BlockBasedTable::GetFilter( auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, filter_blk_handle, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - Cache::Handle* cache_handle = GetEntryFromCache( - block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, - get_context ? &get_context->get_context_stats_.num_cache_filter_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_filter_hit - : nullptr, - statistics, get_context); + Cache::Handle* cache_handle = + GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); filter = reinterpret_cast(block_cache->Value(cache_handle)); } else if (no_io) { @@ -1827,20 +1902,9 @@ CachableEntry BlockBasedTable::GetFilter( : Cache::Priority::LOW); if (s.ok()) { PERF_COUNTER_ADD(filter_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_filter_add++; - get_context->get_context_stats_.num_cache_filter_bytes_insert += - usage; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); - } + UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete filter; return CachableEntry(); } @@ -1867,16 +1931,9 @@ CachableEntry BlockBasedTable::GetUncompressionDict( auto cache_key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->compression_dict_handle, cache_key_buf); - auto cache_handle = GetEntryFromCache( - rep_->table_options.block_cache.get(), cache_key, - BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, - get_context - ? 
&get_context->get_context_stats_.num_cache_compression_dict_miss - : nullptr, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_hit - : nullptr, - rep_->ioptions.statistics, get_context); + auto cache_handle = + GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, + BlockType::kCompressionDictionary, get_context); UncompressionDict* dict = nullptr; if (cache_handle != nullptr) { dict = reinterpret_cast( @@ -1887,43 +1944,31 @@ CachableEntry BlockBasedTable::GetUncompressionDict( std::unique_ptr compression_dict_block; Status s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); - size_t usage = 0; if (s.ok()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, - rep_->ioptions.statistics); - usage = dict->ApproximateMemoryUsage(); + std::unique_ptr uncompression_dict( + new UncompressionDict(compression_dict_block->data.ToString(), + rep_->blocks_definitely_zstd_compressed, + rep_->ioptions.statistics)); + const size_t usage = uncompression_dict->ApproximateMemoryUsage(); s = rep_->table_options.block_cache->Insert( - cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, - &cache_handle, + cache_key, uncompression_dict.get(), usage, + &DeleteCachedUncompressionDictEntry, &cache_handle, rep_->table_options.cache_index_and_filter_blocks_with_high_priority ? Cache::Priority::HIGH : Cache::Priority::LOW); - } - if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_compression_dict_add++; - get_context->get_context_stats_ - .num_cache_compression_dict_bytes_insert += usage; + + if (s.ok()) { + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, + get_context, usage); + dict = uncompression_dict.release(); } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep_->ioptions.statistics, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); + assert(dict == nullptr); + assert(cache_handle == nullptr); } - } else { - // There should be no way to get here if block cache insertion succeeded. - // Though it is still possible something failed earlier. - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete dict; - dict = nullptr; - assert(cache_handle == nullptr); } } return {dict, cache_handle ? 
rep_->table_options.block_cache.get() : nullptr, @@ -1951,7 +1996,7 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - bool is_index, bool key_includes_seq, bool index_key_is_full, + BlockType block_type, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -1972,7 +2017,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - is_index, get_context); + block_type, get_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -2037,7 +2082,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -2070,7 +2115,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, is_index, + ro, block_entry, uncompression_dict, block_type, get_context); // Can't find the block from the cache. If I/O is allowed, read from the @@ -2095,14 +2140,14 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } if (s.ok()) { - SequenceNumber seq_no = rep_->get_global_seqno(is_index); + SequenceNumber seq_no = rep_->get_global_seqno(block_type); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, block_entry, &raw_block_contents, raw_block_comp_type, uncompression_dict, seq_no, GetMemoryAllocator(rep_->table_options), - is_index, get_context); + block_type, get_context); } } } @@ -2113,16 +2158,19 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; - if (!is_index || rep_->table_options.cache_index_and_filter_blocks) { + if (rep_->table_options.cache_index_and_filter_blocks || + (block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + block_type != BlockType::kIndex)) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, is_index, - get_context); + uncompression_dict, block_entry, + block_type, get_context); if (!s.ok()) { return s; @@ -2150,8 +2198,10 @@ Status BlockBasedTable::RetrieveBlock( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, rep_->blocks_maybe_compressed, rep_->blocks_maybe_compressed, uncompression_dict, - rep_->persistent_cache_options, rep_->get_global_seqno(is_index), - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0, + rep_->persistent_cache_options, rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? 
rep_->table_options.read_amp_bytes_per_bit + : 0, GetMemoryAllocator(rep_->table_options)); } @@ -2178,18 +2228,13 @@ InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( const BlockHandle& handle) { // Return a block iterator on the index partition - auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space // for the partition if (block != block_map_->end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); - Cache* block_cache = rep->table_options.block_cache.get(); - assert(block_cache); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.GetCacheHandle())); + auto rep = table_->get_rep(); + assert(rep); + Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. @@ -2531,7 +2576,7 @@ void BlockBasedTableIterator::InitDataBlock() { Status s; table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, is_index_, + read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; @@ -2623,7 +2668,6 @@ InternalIterator* BlockBasedTable::NewIterator( Arena* arena, bool skip_filters, bool for_compaction) { bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); - const bool kIsNotIndex = false; if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, @@ -2633,7 +2677,7 @@ InternalIterator* BlockBasedTable::NewIterator( rep_->index_type == BlockBasedTableOptions::kHashSearch), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, + need_upper_bound_check, prefix_extractor, BlockType::kData, true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); } else { auto* mem = @@ -2643,7 +2687,7 @@ InternalIterator* BlockBasedTable::NewIterator( NewIndexIterator(read_options, need_upper_bound_check), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, + need_upper_bound_check, prefix_extractor, BlockType::kData, true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); } } @@ -2780,7 +2824,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } else { DataBlockIter biter; NewDataBlockIterator( - read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, BlockType::kData, true /* key_includes_seq */, true /* index_key_is_full */, get_context); @@ -2893,7 +2937,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { DataBlockIter biter; NewDataBlockIterator( - read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, BlockType::kData, true /* key_includes_seq */, get_context); if (read_options.read_tier == kBlockCacheTier && diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index e53248fbcba..d8319a3e711 100644 --- a/table/block_based/block_based_table_reader.h +++ 
b/table/block_based/block_based_table_reader.h @@ -25,6 +25,7 @@ #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" #include "table/format.h" @@ -220,8 +221,8 @@ class BlockBasedTable : public TableReader { // input_iter: if it is not null, update this one and return it as Iterator template TBlockIter* NewDataBlockIterator( - const ReadOptions& ro, const BlockHandle& block_hanlde, - TBlockIter* input_iter = nullptr, bool is_index = false, + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter = nullptr, BlockType block_type = BlockType::kData, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr) const; @@ -238,12 +239,14 @@ class BlockBasedTable : public TableReader { friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, + BlockType block_type, GetContext* get_context) const; // If block cache enabled (compressed or uncompressed), looks for the block @@ -258,7 +261,7 @@ class BlockBasedTable : public TableReader { Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index = false, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context = nullptr) const; // Similar to the above, with one crucial difference: it will retrieve the @@ -267,7 +270,7 @@ class BlockBasedTable : public TableReader { Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const; // For the following two functions: @@ -311,7 +314,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, bool is_index = false, + const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context = nullptr) const; // Put a raw block (maybe compressed) to the corresponding block caches. @@ -324,16 +327,14 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
- Status PutDataBlockToCache(const Slice& block_cache_key, - const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, - BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, - SequenceNumber seq_no, - MemoryAllocator* memory_allocator, bool is_index, - GetContext* get_context) const; + Status PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -546,8 +547,12 @@ struct BlockBasedTable::Rep { bool closed = false; const bool immortal_table; - SequenceNumber get_global_seqno(bool is_index) const { - return is_index ? kDisableGlobalSequenceNumber : global_seqno; + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kIndex || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; } }; @@ -560,8 +565,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { const InternalKeyComparator& icomp, InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, bool is_index, - bool key_includes_seq = true, + const SliceTransform* prefix_extractor, + BlockType block_type, bool key_includes_seq = true, bool index_key_is_full = true, bool for_compaction = false) : InternalIteratorBase(false), @@ -575,7 +580,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), - is_index_(is_index), + block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} @@ -690,8 +695,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; - // If the blocks over which we iterate are index blocks - bool is_index_; + BlockType block_type_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; bool index_key_is_full_; diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h new file mode 100644 index 00000000000..9b9c53946c9 --- /dev/null +++ b/table/block_based/block_type.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. 
+
+enum class BlockType : uint8_t {
+  kData,
+  kFilter,
+  kProperties,
+  kCompressionDictionary,
+  kRangeDeletion,
+  kMetaIndex,
+  kIndex,
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index 315e63306f1..7874ce1874f 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -243,11 +243,6 @@ PartitionedFilterBlockReader::GetFilterPartition(
   // This is a possible scenario since block cache might not have had space
   // for the partition
   if (iter != filter_map_.end()) {
-    PERF_COUNTER_ADD(block_cache_hit_count, 1);
-    RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT);
-    RecordTick(statistics(), BLOCK_CACHE_HIT);
-    RecordTick(statistics(), BLOCK_CACHE_BYTES_READ,
-               block_cache->GetUsage(iter->second.GetCacheHandle()));
     return {iter->second.GetValue(), nullptr /* cache */,
             nullptr /* cache_handle */, false /* own_value */};
   }

From d68f9f4580f083023f8e20939b2866cac48f9bb6 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Thu, 6 Jun 2019 13:52:39 -0700
Subject: [PATCH 114/572] simplify include directive involving inttypes (#5402)

Summary:
When using a `PRIu64`-type printf specifier, the current code base does the following:
```
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
```
However, this can be simplified to
```
#include <cinttypes>
```
as long as the flag `-std=c++11` is used.
This should solve issues like https://github.com/facebook/rocksdb/issues/5159
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5402

Differential Revision: D15701195

Pulled By: miasantreble

fbshipit-source-id: 6dac0a05f52aadb55e9728038599d3d2e4b59d03
---
 cache/cache_bench.cc | 5 +----
 cache/lru_cache.cc | 4 ----
 cache/sharded_cache.cc | 4 ----
 db/column_family.cc | 6 +-----
 db/compaction/compaction.cc | 6 +-----
 db/compaction/compaction_job.cc | 7 +------
 db/compaction/compaction_job_stats_test.cc | 6 +-----
 db/compaction/compaction_job_test.cc | 6 +-----
 db/compaction/compaction_picker.cc | 6 +-----
 db/compaction/compaction_picker_fifo.cc | 6 +-----
 db/compaction/compaction_picker_level.cc | 4 ----
 db/compaction/compaction_picker_universal.cc | 6 +-----
 db/corruption_test.cc | 2 +-
 db/db_filesnapshot.cc | 6 +-----
 db/db_impl/db_impl.cc | 3 ---
 db/db_impl/db_impl_compaction_flush.cc | 5 +----
 db/db_impl/db_impl_experimental.cc | 6 +-----
 db/db_impl/db_impl_files.cc | 5 +----
 db/db_impl/db_impl_open.cc | 5 +----
 db/db_impl/db_impl_secondary.cc | 5 +----
 db/db_impl/db_impl_write.cc | 5 +----
 db/db_info_dumper.cc | 6 +-----
 db/db_test_util.h | 6 +-----
 db/dbformat.cc | 6 +-----
 db/external_sst_file_ingestion_job.cc | 6 +-----
 db/flush_job.cc | 6 +-----
 db/forward_iterator_bench.cc | 4 ----
 db/internal_stats.cc | 6 +-----
 db/memtable_list.cc | 6 +-----
 db/range_tombstone_fragmenter.cc | 2 +-
 db/repair.cc | 6 +-----
 db/transaction_log_impl.cc | 5 +----
 db/version_builder.cc | 6 +-----
 db/version_set.cc | 6 +-----
 db/wal_manager.cc | 6 +-----
 examples/multi_processes_example.cc | 2 +-
 file/delete_scheduler_test.cc | 6 +-----
 file/filename.cc | 6 +-----
 file/sst_file_manager_impl.cc | 6 +-----
 include/rocksdb/utilities/backupable_db.h | 6 +-----
 logging/event_logger.cc | 6 +-----
 memtable/memtablerep_bench.cc | 4 ----
 monitoring/histogram.cc | 6 +-----
 monitoring/statistics.cc | 6 +-----
 options/cf_options.cc | 6 +-----
 options/db_options.cc | 6 +-----
 options/options.cc | 6 +-----
 options/options_settable_test.cc | 4 ----
 options/options_test.cc | 6
+----- table/block_based/block_based_table_factory.cc | 7 +------ table/block_based/index_builder.cc | 2 +- table/block_based/index_builder.h | 2 +- table/block_fetcher.cc | 2 +- table/cuckoo/cuckoo_table_reader_test.cc | 6 +----- table/format.cc | 2 +- table/plain/plain_table_index.cc | 6 +----- table/sst_file_reader_test.cc | 2 +- test_util/transaction_test_util.cc | 6 +----- tools/db_bench.cc | 4 ---- tools/db_bench_tool.cc | 6 +----- tools/db_stress.cc | 6 +----- tools/dump/db_dump_tool.cc | 6 +----- tools/ldb_cmd.cc | 6 +----- tools/sst_dump_tool.cc | 6 +----- tools/trace_analyzer_tool.cc | 4 ---- tools/write_stress.cc | 6 +----- util/crc32c_arm64.h | 2 +- util/crc32c_ppc.c | 2 +- util/duplicate_detector.h | 6 +----- util/dynamic_bloom_test.cc | 6 +----- util/rate_limiter_test.cc | 6 +----- util/string_util.cc | 6 +----- utilities/backupable/backupable_db.cc | 6 +----- utilities/blob_db/blob_db.cc | 6 +----- utilities/blob_db/blob_dump_tool.cc | 6 +----- utilities/blob_db/blob_file.cc | 6 +----- utilities/checkpoint/checkpoint_impl.cc | 6 +----- utilities/options/options_util_test.cc | 5 +---- utilities/persistent_cache/persistent_cache_tier.cc | 7 +------ utilities/transactions/pessimistic_transaction_db.cc | 6 +----- utilities/transactions/transaction_base.cc | 6 +----- utilities/transactions/transaction_lock_mgr.cc | 6 +----- utilities/transactions/transaction_test.cc | 4 ---- utilities/transactions/transaction_test.h | 6 +----- utilities/transactions/transaction_util.cc | 6 +----- utilities/transactions/write_prepared_transaction_test.cc | 6 +----- utilities/transactions/write_prepared_txn.cc | 6 +----- utilities/transactions/write_prepared_txn_db.cc | 6 +----- utilities/transactions/write_prepared_txn_db.h | 6 +----- .../transactions/write_unprepared_transaction_test.cc | 4 ---- utilities/transactions/write_unprepared_txn.cc | 4 ---- utilities/transactions/write_unprepared_txn_db.cc | 4 ---- utilities/transactions/write_unprepared_txn_db.h | 4 ---- 93 files changed, 79 insertions(+), 405 deletions(-) diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 098813d9d74..35deb200596 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -3,9 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #ifndef GFLAGS #include int main() { @@ -14,7 +11,7 @@ int main() { } #else -#include +#include #include #include diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index fdcbb4e86cb..676bed3051c 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/lru_cache.h" #include diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index a48a32185bf..8fc0a7a17a3 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/sharded_cache.h" #include diff --git a/db/column_family.cc b/db/column_family.cc index 531cbeca681..2a2e6cb980f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,11 +9,7 @@ #include "db/column_family.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 5dc7e83c8fc..6d7a3561660 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,11 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 5761345d8a2..ca8575a0dc9 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,12 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 5fb805df5f0..221ee3eaad3 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -7,11 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 838cda5eaca..66c3353fcf6 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index a03f7b46fd1..3357e06319d 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -9,11 +9,7 @@ #include "db/compaction/compaction_picker.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 1fc6ed113d2..4ff301d21c3 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -10,11 +10,7 @@ #include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include "db/column_family.h" diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index e9653da8e55..cc0f19b8171 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include #include diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index e8aca00be81..5909ab576c3 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -10,11 +10,7 @@ #include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 9e83c9080e6..82752161f39 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include "db/db_impl/db_impl.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ac544793ee4..3ff7c73f4e8 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -6,11 +6,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 96b911a6d37..bb6ec7db4c5 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -8,9 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include #ifdef OS_SOLARIS #include diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 7be9b62c5d6..bd1a8e74f48 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index a8fed40be01..f0e6fafccba 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -9,11 +9,7 @@ #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 608c8ce4948..c018432c9b8 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include #include "db/event_helpers.h" diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 2fc12746d7d..69c9c4117d7 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 34364d124a8..827d99929a9 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -5,10 +5,7 @@ #include "db/db_impl/db_impl_secondary.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/db_iter.h" #include "db/merge_context.h" diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 947194ace19..7ff2982d147 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index be85357c2e1..e2bb01e0e97 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -3,13 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "db/db_info_dumper.h" -#include +#include #include #include #include diff --git a/db/db_test_util.h b/db/db_test_util.h index 4e9fcafadfa..6e1d0ed7a13 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -9,12 +9,8 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include -#include +#include #include #include diff --git a/db/dbformat.cc b/db/dbformat.cc index cd2878198c4..bfaea868b53 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -8,11 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "monitoring/perf_context_imp.h" #include "port/port.h" diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index aec398552c7..0068685b0ba 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,11 +7,7 @@ #include "db/external_sst_file_ingestion_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/flush_job.cc b/db/flush_job.cc index 2b2696c10ba..589d81f2974 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -9,11 +9,7 @@ #include "db/flush_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 17b0ca16544..174a258a682 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -3,10 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #if !defined(GFLAGS) || defined(ROCKSDB_LITE) #include int main() { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 21dde297ab6..50f6ed2e688 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -10,11 +10,7 @@ #include "db/internal_stats.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 045bfc9a2d3..0f796eb9a73 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,11 +5,7 @@ // #include "db/memtable_list.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index e3eb18908a5..3d3a5c4520f 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include "util/autovector.h" diff --git a/db/repair.cc b/db/repair.cc index 400e754ba45..6967a46e36c 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -60,11 +60,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/dbformat.h" diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index f92d563eb8e..2e4475bb6ac 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,12 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include "db/transaction_log_impl.h" -#include +#include #include "db/write_batch_internal.h" #include "util/file_reader_writer.h" diff --git a/db/version_builder.cc b/db/version_builder.cc index 84e4dc6579a..9d2ba9ab4ee 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -9,11 +9,7 @@ #include "db/version_builder.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/version_set.cc b/db/version_set.cc index ed9a316ac72..96bf22e57b4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,11 +9,7 @@ #include "db/version_set.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 71c2ffe4b22..58671d599c5 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -9,11 +9,7 @@ #include "db/wal_manager.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index b1c1d02ba25..7350e1be253 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -14,7 +14,7 @@ // run for a while, tailing the logs of the primary. After process with primary // instance exits, this process will keep running until you hit 'CTRL+C'. -#include +#include #include #include #include diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 510753b3b45..3549a9f84eb 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -3,11 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/file/filename.cc b/file/filename.cc index c9f22e585b7..d4f7dd9ec7c 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -6,12 +6,8 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "file/filename.h" -#include +#include #include #include diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index efd9e30e6a5..08ea873258a 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -5,11 +5,7 @@ #include "file/sst_file_manager_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/db_impl/db_impl.h" diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 7817c564965..1ca4fc9a670 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,11 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/logging/event_logger.cc b/logging/event_logger.cc index aceccdf93c0..182e282b2f0 100644 --- a/logging/event_logger.cc +++ b/logging/event_logger.cc @@ -3,13 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "logging/event_logger.h" -#include +#include #include #include #include diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 003d59b2a86..1e2b5bdd1e5 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 4bc7139d304..29bf78ad7c9 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "monitoring/histogram.h" -#include +#include #include #include #include diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index fe2f2e25af3..15d702d1f4a 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -5,11 +5,7 @@ // #include "monitoring/statistics.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "rocksdb/statistics.h" #include "port/likely.h" #include diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..5830fc6613d 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -5,11 +5,7 @@ #include "options/cf_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/options/db_options.cc b/options/db_options.cc index 72e348b3227..bdcdd250a0a 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -5,11 +5,7 @@ #include "options/db_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "logging/logging.h" #include "port/port.h" diff --git a/options/options.cc b/options/options.cc index 8977b58905f..1d2b6193cbc 100644 --- a/options/options.cc +++ b/options/options.cc @@ -9,11 +9,7 @@ #include "rocksdb/options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "monitoring/statistics.h" diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2e21a2688f8..6044cc4b1c4 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include "options/options_helper.h" diff --git a/options/options_test.cc b/options/options_test.cc index 1aa3bace7dd..9fcd241d70f 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -7,14 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include #include -#include +#include #include "cache/lru_cache.h" #include "cache/sharded_cache.h" diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 121cc916e25..cf205be72de 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,12 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index 738b9e3e099..c1ce541ae56 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -10,7 +10,7 @@ #include "table/block_based/index_builder.h" #include -#include +#include #include #include diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 7e6a4bb0776..6baa9891b1d 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -10,7 +10,7 @@ #pragma once #include -#include +#include #include #include diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 72b567fc23d..afcbbaee4f5 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -9,7 +9,7 @@ #include "table/block_fetcher.h" -#include +#include #include #include "logging/logging.h" diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 681e0dfdf3e..dd65ffe8490 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -13,11 +13,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/table/format.cc b/table/format.cc index a4441fe5646..2046903a703 100644 --- a/table/format.cc +++ b/table/format.cc @@ -9,7 +9,7 @@ #include "table/format.h" -#include +#include #include #include "block_fetcher.h" diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index 196be22cfe9..b4207f348cb 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "table/plain/plain_table_index.h" #include "util/coding.h" diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 529634ccd75..dd7a5101677 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include +#include #include "rocksdb/db.h" #include "rocksdb/sst_file_reader.h" diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index 3a7d9e97f50..b71ad0a1f56 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -4,13 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "test_util/transaction_test_util.h" -#include +#include #include #include #include diff --git a/tools/db_bench.cc b/tools/db_bench.cc index 634bbba30ac..1ad77295fa6 100644 --- a/tools/db_bench.cc +++ b/tools/db_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index c6f19bed585..b254978c5ed 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifdef GFLAGS #ifdef NUMA #include @@ -20,7 +16,7 @@ #include #endif #include -#include +#include #include #include #include diff --git a/tools/db_stress.cc b/tools/db_stress.cc index dc8f8152376..5fd84258b1f 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -28,12 +28,8 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - #include -#include +#include #include #include #include diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index 8c5fa82e5b9..06a47ce725b 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "rocksdb/db.h" diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index d6f9b415707..958d862fd32 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -7,11 +7,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index aa051da01f5..ed5600194ad 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -8,11 +8,7 @@ #include "tools/sst_dump_tool_imp.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 6ab606f6a6a..627610ae0f4 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -6,10 +6,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifdef GFLAGS #ifdef NUMA #include diff --git a/tools/write_stress.cc b/tools/write_stress.cc index 8cde31e6b84..95948ef5730 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -56,11 +56,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - -#include +#include #include #include #include diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h index 0e77ecd0ef5..80b3aca361a 100644 --- a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -6,7 +6,7 @@ #ifndef UTIL_CRC32C_ARM64_H #define UTIL_CRC32C_ARM64_H -#include +#include #if defined(__aarch64__) || defined(__AARCH64__) #ifdef __ARM_FEATURE_CRC32 diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index 654d606aaad..ce0b9f27ce6 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -6,7 +6,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#define CRC_TABLE -#include +#include #include #include #include "util/crc32c_ppc_constants.h" diff --git a/util/duplicate_detector.h b/util/duplicate_detector.h index 40a1cbd129b..1fab009751b 100644 --- a/util/duplicate_detector.h +++ b/util/duplicate_detector.h @@ -5,11 +5,7 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "util/set_comparator.h" diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 7ca8bb891aa..3f98ccd0189 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,11 +11,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index d9f17cc3ac6..7795e01fc9d 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "util/rate_limiter.h" -#include +#include #include #include diff --git a/util/string_util.cc b/util/string_util.cc index 26e6759ac2a..74f6afbf0f4 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -5,12 +5,8 @@ // #include "util/string_util.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include -#include +#include #include #include #include diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 7a2e1940316..b7592a0ce2b 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -23,11 +23,7 @@ #include "util/string_util.h" #include "utilities/checkpoint/checkpoint_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - -#include +#include #include #include #include diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index d660def4908..bee36a667a2 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -5,13 +5,9 @@ // #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/blob_db/blob_db.h" -#include +#include #include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 37eee19dbe1..b74a211bc95 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -4,12 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/blob_db/blob_dump_tool.h" -#include +#include #include #include #include diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 03cff7834b9..3f128c7d55e 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -6,11 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_file.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 7468c8eedee..4835f26da6e 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -11,11 +11,7 @@ #include "utilities/checkpoint/checkpoint_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 8c71dbf5dc3..3926275af5e 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -4,11 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include diff --git a/utilities/persistent_cache/persistent_cache_tier.cc b/utilities/persistent_cache/persistent_cache_tier.cc index 732762a1652..752a6fb70b6 100644 --- a/utilities/persistent_cache/persistent_cache_tier.cc +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -5,14 +5,9 @@ // #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/persistent_cache/persistent_cache_tier.h" -#include "inttypes.h" - +#include #include #include diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index e906b444ff5..2f9c918a3b4 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/pessimistic_transaction_db.h" -#include +#include #include #include #include diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 6553b49614c..5621a7fa372 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/transaction_base.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/column_family.h" #include "db/db_impl/db_impl.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 757b77fde4e..084d817ea08 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_lock_mgr.h" -#include +#include #include #include diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6c9f4bccd62..35a9706830e 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -5,10 
+5,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" #include diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index da2a08d3c52..9b634c11ca7 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -5,11 +5,7 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index c582b73aa3e..407feaaa88a 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_util.h" -#include +#include #include #include diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 8b52b1ae662..7c588f4ef69 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" -#include +#include #include #include #include diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 05650e2b3f9..f55615063e5 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/write_prepared_txn.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index e2a8fbbf20f..8e08d074134 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/write_prepared_txn_db.h" -#include +#include #include #include #include diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index ffdf2f29d8f..876279cba23 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -6,11 +6,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index 914f3f581e4..faa6c774578 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -5,10 +5,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" #include "utilities/transactions/write_unprepared_txn.h" #include "utilities/transactions/write_unprepared_txn_db.h" diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index efd766514c8..73e9a8837a0 100644 --- 
a/utilities/transactions/write_unprepared_txn.cc
+++ b/utilities/transactions/write_unprepared_txn.cc
@@ -10,10 +10,6 @@
 #include "util/cast_util.h"
 #include "utilities/transactions/write_unprepared_txn_db.h"
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 namespace rocksdb {
 bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) {
diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc
index a1aeedf2e15..ea655f88e3c 100644
--- a/utilities/transactions/write_unprepared_txn_db.cc
+++ b/utilities/transactions/write_unprepared_txn_db.cc
@@ -5,10 +5,6 @@
 #ifndef ROCKSDB_LITE
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 #include "utilities/transactions/write_unprepared_txn_db.h"
 #include "rocksdb/utilities/transaction_db.h"
 #include "util/cast_util.h"
diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h
index 4b4e31e1b60..fab8ce8263d 100644
--- a/utilities/transactions/write_unprepared_txn_db.h
+++ b/utilities/transactions/write_unprepared_txn_db.h
@@ -6,10 +6,6 @@
 #pragma once
 #ifndef ROCKSDB_LITE
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 #include "utilities/transactions/write_prepared_txn_db.h"
 #include "utilities/transactions/write_unprepared_txn.h"

From fd94353ea36aa6680ef99faab644e23a33599720 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 6 Jun 2019 16:14:51 -0700
Subject: [PATCH 115/572] Remove the artifacts field from
 stress_crash/stress_crash_with_atomic_flush

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5422
Differential Revision: D15706212
Pulled By: ltamasi
fbshipit-source-id: 0acf060fb8568efee51c033e50b492bcf1095a4c
---
 build_tools/rocksdb-lego-determinator | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator
index 2447a19ae44..31bcbad38cb 100755
--- a/build_tools/rocksdb-lego-determinator
+++ b/build_tools/rocksdb-lego-determinator
@@ -109,13 +109,6 @@ else
   TASK_CREATION_TOOL="false"
 fi
-ARTIFACTS=" 'artifacts': [
-  {
-    'name':'database',
-    'paths':[ '/dev/shm/rocksdb' ],
-  }
-]"
-
 #
 # A mechanism to disable tests temporarily
 #
@@ -395,7 +388,6 @@ STRESS_CRASH_TEST_COMMANDS="[
         $PARSER
       }
     ],
-    $ARTIFACTS,
     $REPORT
   }
 ]"
@@ -424,7 +416,6 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[
         $PARSER
       }
     ],
-    $ARTIFACTS,
     $REPORT
   }
 ]"

From ad52626cf4fd53b1549c4d04ea4c4dae9e4441d9 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 6 Jun 2019 17:30:57 -0700
Subject: [PATCH 116/572] Remove special characters from job names (#5424)

Summary:
Special characters like slashes and parentheses are not supported.
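The renames in this patch are applied by hand, but the constraint can be made
concrete with a short sketch. This is hypothetical code, not part of the
change, and the exact character set the job runner accepts is an assumption
here:

  #include <cctype>
  #include <string>

  // Hypothetical sanitizer: keep letters, digits, spaces and underscores,
  // and map everything else (slashes, parentheses, ...) to a space.
  std::string SanitizeJobName(const std::string& name) {
    std::string out;
    out.reserve(name.size());
    for (char c : name) {
      if (std::isalnum(static_cast<unsigned char>(c)) || c == ' ' || c == '_') {
        out.push_back(c);
      } else {
        out.push_back(' ');
      }
    }
    return out;
  }

  // SanitizeJobName("Rocksdb Stress/Crash Test (atomic flush)")
  //   == "Rocksdb Stress Crash Test  atomic flush "

The patch below instead picks readable replacement names such as
'Rocksdb Stress and Crash Test'.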
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5424 Differential Revision: D15708067 Pulled By: ltamasi fbshipit-source-id: 90527ec3ee882a0cdd1249c3946f5eff2ff7c115 --- build_tools/rocksdb-lego-determinator | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index 31bcbad38cb..e47b2ef30d8 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -369,7 +369,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ # STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test', + 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -397,7 +397,7 @@ STRESS_CRASH_TEST_COMMANDS="[ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test (atomic flush)', + 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -489,7 +489,7 @@ ASAN_CRASH_TEST_COMMANDS="[ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under ASAN', + 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -553,7 +553,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under UBSAN', + 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ From 0f48e56f96c9ef360a09cb3a76830c165c9ae392 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 7 Jun 2019 15:13:43 -0700 Subject: [PATCH 117/572] Revert to checking the upper bound on a per-key basis in BlockBasedTableIterator (#5428) Summary: PR #5111 reduced the number of key comparisons when iterating with upper/lower bounds; however, this caused a regression for MyRocks. Reverting to the previous behavior in BlockBasedTableIterator as a hotfix. 
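As a sketch of the two strategies (simplified and hypothetical; this is not
the actual BlockBasedTableIterator code), the per-key check compares every
key against the bound, while the optimized variant consults a cached
per-block flag that can go stale:

  #include <string>

  // 'upper_bound' is exclusive, and Compare() stands in for
  // user_comparator_.Compare() in the real iterator.
  struct IteratorSketch {
    std::string key;
    const std::string* upper_bound = nullptr;
    bool valid = false;
    // Cached block-level flag used by the PR #5111 optimization: true only
    // when the whole current data block is known to be below the bound.
    bool block_within_upper_bound = false;

    static int Compare(const std::string& a, const std::string& b) {
      return a.compare(b);
    }

    // Per-key check (the behavior restored by this patch): validity can
    // never go stale because every key is compared against the bound.
    void CheckBoundPerKey() {
      if (valid && upper_bound != nullptr && Compare(key, *upper_bound) >= 0) {
        valid = false;
      }
    }

    // Block-level check: skips the comparison while the cached flag says
    // the block is within bounds. Cheaper, but if the flag falls out of
    // sync with the iterator position, out-of-bound keys leak through;
    // that is the kind of regression this hotfix works around.
    void CheckBoundPerBlock() {
      if (valid && !block_within_upper_bound && upper_bound != nullptr &&
          Compare(key, *upper_bound) >= 0) {
        valid = false;
      }
    }
  };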
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5428 Differential Revision: D15721038 Pulled By: ltamasi fbshipit-source-id: 5450106442f1763bccd17f6cfd648697f2ae8b6c --- db/db_iter.cc | 5 +++++ table/block_based/block_based_table_reader.cc | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/db/db_iter.cc b/db/db_iter.cc index 29a1a9eac1a..633724c5763 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,6 +467,8 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; @@ -859,6 +861,9 @@ void DBIter::PrevInternal() { return; } + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0d7e3cf53a0..68213f04149 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2597,9 +2597,15 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = + // TODO: we should be able to use !data_block_within_upper_bound_ here + // instead of performing the comparison; however, the flag can apparently + // be out of sync with the comparison in some cases. This should be + // investigated. + const bool next_block_is_out_of_bound = read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + block_iter_points_to_real_block_ && + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { From b703a56e5cd722aaf169baa3e28127426776b6a9 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 7 Jun 2019 15:31:40 -0700 Subject: [PATCH 118/572] Potential fix for stress test failure due to "SST file ahead of WAL" error (#5412) Summary: I'm not able to prove it, but the stress test failure may be caused by the following sequence of events - 1. Crash db_stress while writing the log file. This should result in a corrupted WAL. 2. Run db_stress with recycle_log_file_num=1. Crash during recovery immediately after writing manifest and updating the current file. The old log from the previous run is left behind, but the memtable would have been flushed during recovery and the CF log number will point to the newer log 3. Run db_stress with recycle_log_file_num=0. During recovery, the old log file will be processed and the corruption will be detected. Since the CF has moved ahead, we get the "SST file is ahead of WAL" error Test - 1. stress_crash 2. 
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5412
Differential Revision: D15699120
Pulled By: anand1976
fbshipit-source-id: 9092ce81e7c4a0b4b4e66560c23ea4812a4d9cbe
---
 db/db_impl/db_impl_compaction_flush.cc | 7 +++++
 db/db_impl/db_impl_open.cc | 5 +++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index bd1a8e74f48..8cb37484cac 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -107,6 +107,13 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) {
       if (!s.ok()) {
         break;
       }
+
+      if (immutable_db_options_.recycle_log_file_num > 0) {
+        s = log->Close();
+        if (!s.ok()) {
+          break;
+        }
+      }
     }
     if (s.ok()) {
       s = directories_.GetWalDir()->Fsync();
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 69c9c4117d7..baa4fe707aa 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -555,12 +555,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
   bool stop_replay_for_corruption = false;
   bool flushed = false;
   uint64_t corrupted_log_number = kMaxSequenceNumber;
+  uint64_t min_log_number = MinLogNumberToKeep();
   for (auto log_number : log_numbers) {
-    if (log_number < versions_->min_log_number_to_keep_2pc()) {
+    if (log_number < min_log_number) {
       ROCKS_LOG_INFO(immutable_db_options_.info_log,
                      "Skipping log #%" PRIu64
                      " since it is older than min log to keep #%" PRIu64,
-                     log_number, versions_->min_log_number_to_keep_2pc());
+                     log_number, min_log_number);
       continue;
     }
     // The previous incarnation may not have written any MANIFEST

From a16d0cc494ea8853b84c606efc04b61e33878fff Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Fri, 7 Jun 2019 19:34:48 -0700
Subject: [PATCH 119/572] Fix build errors regarding const qualifier being
 ignored on cast result type (#5432)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This affects some TSAN builds:

env/env_test.cc: In member function ‘virtual void rocksdb::EnvPosixTestWithParam_MultiRead_Test::TestBody()’:
env/env_test.cc:1126:76: error: type qualifiers ignored on cast result type [-Werror=ignored-qualifiers]
     auto data = NewAligned(kSectorSize * 8, static_cast<const char>(i + 1));
                                                                            ^
env/env_test.cc:1154:77: error: type qualifiers ignored on cast result type [-Werror=ignored-qualifiers]
      auto buf = NewAligned(kSectorSize * 8, static_cast<const char>(i*2 + 1));
                                                                             ^
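A minimal reproduction of the warning, separate from the RocksDB tree, shows
why the const qualifier has to go:

  // Compile with: g++ -Wignored-qualifiers -Werror -c cast_example.cc
  char TakeChar(char c) { return c; }

  void Example(int i) {
    // warning: type qualifiers ignored on cast result type
    // The cast yields a prvalue, so a const result type is meaningless,
    // and -Werror builds such as the TSAN job turn the warning fatal.
    TakeChar(static_cast<const char>(i + 1));

    // The fix applied below: cast to the unqualified type.
    TakeChar(static_cast<char>(i + 1));
  }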
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5432
Differential Revision: D15727277
Pulled By: ltamasi
fbshipit-source-id: dc0e687b123e7c4d703ccc0c16b7167e07d1c9b0
---
 env/env_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index a2b6db5c475..6f225e37f67 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -1123,7 +1123,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) {
 #endif
   ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
   for (size_t i = 0; i < kNumSectors; ++i) {
-    auto data = NewAligned(kSectorSize * 8, static_cast<const char>(i + 1));
+    auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
     Slice slice(data.get(), kSectorSize);
     ASSERT_OK(wfile->Append(slice));
   }
@@ -1151,7 +1151,7 @@
   ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
   ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
   for (size_t i = 0; i < reqs.size(); ++i) {
-    auto buf = NewAligned(kSectorSize * 8, static_cast<const char>(i*2 + 1));
+    auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i*2 + 1));
     ASSERT_OK(reqs[i].status);
     ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0);
   }

From c292dc85402e0da7b816076ceb4b404e427d5ab4 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 10 Jun 2019 11:47:16 -0700
Subject: [PATCH 120/572] WritePrepared: reduce prepared_mutex_ overhead (#5420)

Summary:
The patch reduces contention on prepared_mutex_ using these techniques:
1) Move ::RemovePrepared() to be called from the commit callback when we have
two write queues.
2) Use two separate mutexes for PreparedHeap: prepared_mutex_, which is still
needed for ::RemovePrepared, and ::push_pop_mutex(), which is needed for
::AddPrepared(). Given that ::AddPrepared is called only from the first write
queue and ::RemovePrepared mostly from the second, the two write queues no
longer compete with each other over a single mutex. ::RemovePrepared might
occasionally need to acquire ::push_pop_mutex() if ::erase() ends up calling
::pop().
3) Acquire ::push_pop_mutex() on the first callback of the write queue and
release it on the last.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5420
Differential Revision: D15741985
Pulled By: maysamyabandeh
fbshipit-source-id: 84ce8016007e88bb6e10da5760ba1f0d26347735
---
 db/db_impl/db_impl_write.cc | 19 ++-
 db/pre_release_callback.h | 6 +-
 db/write_callback_test.cc | 3 +-
 .../transactions/pessimistic_transaction.cc | 3 +-
 .../write_prepared_transaction_test.cc | 61 ++++++----
 utilities/transactions/write_prepared_txn.cc | 26 ++--
 .../transactions/write_prepared_txn_db.cc | 67 +++++++----
 .../transactions/write_prepared_txn_db.h | 113 +++++++++++++-----
 .../transactions/write_unprepared_txn.cc | 4 +-
 .../transactions/write_unprepared_txn_db.cc | 4 +-
 .../transactions/write_unprepared_txn_db.h | 6 +-
 11 files changed, 218 insertions(+), 94 deletions(-)

diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7ff2982d147..21b123c3a94 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -263,6 +263,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     size_t total_count = 0;
     size_t valid_batches = 0;
     size_t total_byte_size = 0;
+    size_t pre_release_callback_cnt = 0;
     for (auto* writer : write_group) {
       if (writer->CheckCallback(this)) {
         valid_batches += writer->batch_cnt;
@@ -270,9 +271,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
           total_count += WriteBatchInternal::Count(writer->batch);
           parallel = parallel && !writer->batch->HasMerge();
         }
-
         total_byte_size = WriteBatchInternal::AppendedByteSize(
             total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        if (writer->pre_release_callback) {
+          pre_release_callback_cnt++;
+        }
       }
     }
     // Note about seq_per_batch_: either disableWAL is set for the entire write
@@ -336,6 +339,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // PreReleaseCallback is called after WAL write and before memtable write
     if (status.ok()) {
       SequenceNumber next_sequence = current_sequence;
+      size_t index = 0;
      // Note: the logic for advancing seq here must be consistent with the
      // logic in WriteBatchInternal::InsertInto(write_group...) as well as
      // with WriteBatchInternal::InsertInto(write_batch...)
that is called on @@ -347,7 +351,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, writer->sequence = next_sequence; if (writer->pre_release_callback) { Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -675,11 +680,15 @@ Status DBImpl::WriteImplWALOnly( // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } @@ -758,11 +767,13 @@ Status DBImpl::WriteImplWALOnly( WriteStatusCheck(status); } if (status.ok()) { + size_t index = 0; for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -1121,7 +1132,7 @@ Status DBImpl::WriteRecoverableState() { // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB mutex_.Unlock(); status = recoverable_state_pre_release_callback_->Callback( - sub_batch_seq, !DISABLE_MEMTABLE, no_log_num); + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); mutex_.Lock(); } } diff --git a/db/pre_release_callback.h b/db/pre_release_callback.h index f91ef1b27ac..e4167904ff8 100644 --- a/db/pre_release_callback.h +++ b/db/pre_release_callback.h @@ -27,8 +27,12 @@ class PreReleaseCallback { // is_mem_disabled is currently used for debugging purposes to assert that // the callback is done from the right write queue. // If non-zero, log_number indicates the WAL log to which we wrote. + // index >= 0 specifies the order of callback in the same write thread. + // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. 
virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, - uint64_t log_number) = 0; + uint64_t log_number, size_t index, size_t total) = 0; }; } // namespace rocksdb diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index b5e26a8a7f0..1ab97b04589 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -304,7 +304,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { db_impl_->SetLastPublishedSequence(last_seq); return Status::OK(); } diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index ed7444894c7..1c0e2f06384 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -231,7 +231,8 @@ Status WriteCommittedTxn::PrepareInternal() { (void)two_write_queues_; // to silence unused private field warning } virtual Status Callback(SequenceNumber, bool is_mem_disabled, - uint64_t log_number) override { + uint64_t log_number, size_t /*index*/, + size_t /*total*/) override { #ifdef NDEBUG (void)is_mem_disabled; #endif diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7c588f4ef69..e62b8344169 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -7,9 +7,9 @@ #include "utilities/transactions/transaction_test.h" -#include #include #include +#include #include #include #include @@ -55,25 +55,17 @@ TEST(PreparedHeap, BasicsTest) { heap.push(34l); // Test that old min is still on top ASSERT_EQ(14l, heap.top()); - heap.push(13l); - // Test that the new min will be on top - ASSERT_EQ(13l, heap.top()); - // Test that it is persistent - ASSERT_EQ(13l, heap.top()); heap.push(44l); heap.push(54l); heap.push(64l); heap.push(74l); heap.push(84l); // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); + ASSERT_EQ(14l, heap.top()); heap.erase(24l); // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); + ASSERT_EQ(14l, heap.top()); heap.erase(14l); - // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); - heap.erase(13l); // Test that the new comes to the top after multiple erase ASSERT_EQ(34l, heap.top()); heap.erase(34l); @@ -3001,13 +2993,16 @@ TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) { ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value)); port::Mutex txn_mutex_; - // t1) Insert prepared entry, t2) commit other entires to advance max - // evicted sec and finish checking the existing prepared entires, t1) + // t1) Insert prepared entry, t2) commit other entries to advance max + // evicted sec and finish checking the existing prepared entries, t1) // AddPrepared, t2) update max_evicted_seq_ rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"AddPrepared::begin:pause", "AddPreparedBeforeMax::read_thread:start"}, - {"AdvanceMaxEvictedSeq::update_max:pause", "AddPrepared::begin:resume"}, - {"AddPrepared::end", "AdvanceMaxEvictedSeq::update_max:resume"}, + {"AddPreparedCallback::AddPrepared::begin:pause", + "AddPreparedBeforeMax::read_thread:start"}, + {"AdvanceMaxEvictedSeq::update_max:pause", + "AddPreparedCallback::AddPrepared::begin:resume"}, + {"AddPreparedCallback::AddPrepared::end", + 
"AdvanceMaxEvictedSeq::update_max:resume"}, }); SyncPoint::GetInstance()->EnableProcessing(); @@ -3061,20 +3056,36 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { ReOpen(); std::atomic snap = {nullptr}; std::atomic exp_prepare = {0}; + std::atomic snapshot_taken = {false}; // Value is synchronized via snap PinnableSlice value; // Take a snapshot after publish and before RemovePrepared:Start + auto snap_callback = [&]() { + ASSERT_EQ(nullptr, snap.load()); + snap.store(db->GetSnapshot()); + ReadOptions roptions; + roptions.snapshot = snap.load(); + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); + ASSERT_OK(s); + snapshot_taken.store(true); + }; auto callback = [&](void* param) { SequenceNumber prep_seq = *((SequenceNumber*)param); if (prep_seq == exp_prepare.load()) { // only for write_thread - ASSERT_EQ(nullptr, snap.load()); - snap.store(db->GetSnapshot()); - ReadOptions roptions; - roptions.snapshot = snap.load(); - auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); - ASSERT_OK(s); + // We need to spawn a thread to avoid deadlock since getting a + // snpashot might end up calling AdvanceSeqByOne which needs joining + // the write queue. + auto t = rocksdb::port::Thread(snap_callback); + t.detach(); + TEST_SYNC_POINT("callback:end"); } }; + // Wait for the first snapshot be taken in GetSnapshotInternal. Although + // it might be updated before GetSnapshotInternal finishes but this should + // cover most of the cases. + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"}, + }); SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback); SyncPoint::GetInstance()->EnableProcessing(); // Thread to cause frequent evictions @@ -3098,9 +3109,15 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // Let an eviction to kick in std::this_thread::yield(); + snapshot_taken.store(false); exp_prepare.store(txn->GetId()); ASSERT_OK(txn->Commit()); delete txn; + // Wait for the snapshot taking that is triggered by + // RemovePrepared:Start callback + while (!snapshot_taken) { + std::this_thread::yield(); + } // Read with the snapshot taken before delayed_prepared_ cleanup ReadOptions roptions; diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index f55615063e5..f4c21d4769e 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -169,12 +169,15 @@ Status WritePreparedTxn::CommitInternal() { assert(!s.ok() || seq_used != kMaxSequenceNumber); const SequenceNumber commit_batch_seq = seq_used; if (LIKELY(do_one_write || !s.ok())) { - if (LIKELY(s.ok())) { - // Note RemovePrepared should be called after WriteImpl that publishsed + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues && + s.ok())) { + // Note: RemovePrepared should be called after WriteImpl that publishsed // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. 
wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); - } + } // else RemovePrepared is called from within PreReleaseCallback if (UNLIKELY(!do_one_write)) { + assert(!s.ok()); + // Cleanup the prepared entry we added with add_prepared_callback wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } return s; @@ -199,10 +202,14 @@ Status WritePreparedTxn::CommitInternal() { NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_aux_batch); assert(!s.ok() || seq_used != kMaxSequenceNumber); - // Note RemovePrepared should be called after WriteImpl that publishsed the - // seq. Otherwise SmallestUnCommittedSeq optimization breaks. - wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); - wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues)) { + if (s.ok()) { + // Note: RemovePrepared should be called after WriteImpl that publishsed + // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. + wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); + } + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } // else RemovePrepared is called from within PreReleaseCallback return s; } @@ -348,6 +355,7 @@ Status WritePreparedTxn::RollbackInternal() { return s; } if (do_one_write) { + assert(!db_impl_->immutable_db_options().two_write_queues); wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); return s; } // else do the 2nd write for commit @@ -370,9 +378,13 @@ Status WritePreparedTxn::RollbackInternal() { ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, "RollbackInternal (status=%s) commit: %" PRIu64, s.ToString().c_str(), GetId()); + // TODO(lth): For WriteUnPrepared that rollback is called frequently, + // RemovePrepared could be moved to the callback to reduce lock contention. if (s.ok()) { wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); } + // Note: RemovePrepared for prepared batch is called from within + // PreReleaseCallback wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH); return s; diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 8e08d074134..96e1aa7a7ba 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -7,8 +7,8 @@ #include "utilities/transactions/write_prepared_txn_db.h" -#include #include +#include #include #include #include @@ -61,8 +61,8 @@ Status WritePreparedTxnDB::Initialize( explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) : db_(db) {} Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(!is_mem_disabled); db_->AddCommitted(commit_seq, commit_seq); return Status::OK(); @@ -211,9 +211,7 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); assert(!s.ok() || seq_used != kMaxSequenceNumber); - // Note RemovePrepared should be called after WriteImpl that publishsed the - // seq. Otherwise SmallestUnCommittedSeq optimization breaks. 
- RemovePrepared(prepare_seq, batch_cnt); + // Note: RemovePrepared is called from within PreReleaseCallback return s; } @@ -389,8 +387,8 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) { new std::atomic[COMMIT_CACHE_SIZE] {}); } -void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max) { - prepared_mutex_.AssertHeld(); +void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max, + bool locked) { // When max_evicted_seq_ advances, move older entries from prepared_txns_ // to delayed_prepared_. This guarantees that if a seq is lower than max, // then it is not in prepared_txns_ and save an expensive, synchronized @@ -401,25 +399,42 @@ void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max) { "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64, prepared_txns_.empty(), prepared_txns_.empty() ? 0 : prepared_txns_.top()); - while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { - auto to_be_popped = prepared_txns_.top(); - delayed_prepared_.insert(to_be_popped); - ROCKS_LOG_WARN(info_log_, - "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 - " new_max=%" PRIu64, - static_cast(delayed_prepared_.size()), - to_be_popped, new_max); - prepared_txns_.pop(); - delayed_prepared_empty_.store(false, std::memory_order_release); + const SequenceNumber prepared_top = prepared_txns_.top(); + const bool empty = prepared_top == kMaxSequenceNumber; + // Preliminary check to avoid the synchronization cost + if (!empty && prepared_top <= new_max) { + if (locked) { + // Needed to avoid double locking in pop(). + prepared_txns_.push_pop_mutex()->Unlock(); + } + WriteLock wl(&prepared_mutex_); + // Need to fetch fresh values of ::top after mutex is acquired + while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { + auto to_be_popped = prepared_txns_.top(); + delayed_prepared_.insert(to_be_popped); + ROCKS_LOG_WARN(info_log_, + "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 + " new_max=%" PRIu64, + static_cast(delayed_prepared_.size()), + to_be_popped, new_max); + prepared_txns_.pop(); + delayed_prepared_empty_.store(false, std::memory_order_release); + } + if (locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } } } -void WritePreparedTxnDB::AddPrepared(uint64_t seq) { +void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) { ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64, seq, max_evicted_seq_.load()); TEST_SYNC_POINT("AddPrepared::begin:pause"); TEST_SYNC_POINT("AddPrepared::begin:resume"); - WriteLock wl(&prepared_mutex_); + if (!locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + prepared_txns_.push_pop_mutex()->AssertHeld(); prepared_txns_.push(seq); auto new_max = future_max_evicted_seq_.load(); if (UNLIKELY(seq <= new_max)) { @@ -429,7 +444,10 @@ void WritePreparedTxnDB::AddPrepared(uint64_t seq) { "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64 " <= %" PRIu64, seq, new_max); - CheckPreparedAgainstMax(new_max); + CheckPreparedAgainstMax(new_max, true /*locked*/); + } + if (!locked) { + prepared_txns_.push_pop_mutex()->Unlock(); } TEST_SYNC_POINT("AddPrepared::end"); } @@ -582,10 +600,7 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, std::memory_order_relaxed)) { }; - { - WriteLock wl(&prepared_mutex_); - CheckPreparedAgainstMax(new_max); - } + CheckPreparedAgainstMax(new_max, false /*locked*/); // With each change to max_evicted_seq_ fetch the live snapshots behind it. 
// We use max as the version of snapshots to identify how fresh are the @@ -641,6 +656,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal( // than the smallest uncommitted seq when the snapshot was taken. auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq(); SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first"); assert(snap_impl); SequenceNumber snap_seq = snap_impl->GetSequenceNumber(); // Note: Check against future_max_evicted_seq_ (in contrast with @@ -679,6 +695,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal( db_impl_->immutable_db_options().info_log, "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64, snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end"); return snap_impl; } diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 876279cba23..acf2b97a99d 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -324,10 +324,11 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // Add the transaction with prepare sequence seq to the prepared list. // Note: must be called serially with increasing seq on each call. - void AddPrepared(uint64_t seq); + // locked is true if prepared_mutex_ is already locked. + void AddPrepared(uint64_t seq, bool locked = false); // Check if any of the prepared txns are less than new max_evicted_seq_. Must // be called with prepared_mutex_ write locked. - void CheckPreparedAgainstMax(SequenceNumber new_max); + void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked); // Remove the transaction with prepare sequence seq from the prepared list void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1); // Add the transaction with prepare sequence prepare_seq and commit sequence @@ -461,6 +462,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { std::memory_order order = std::memory_order_relaxed); private: + friend class AddPreparedCallback; friend class PreparedHeap_BasicsTest_Test; friend class PreparedHeap_Concurrent_Test; friend class PreparedHeap_EmptyAtTheEnd_Test; @@ -506,10 +508,15 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // A heap with the amortized O(1) complexity for erase. It uses one extra heap // to keep track of erased entries that are not yet on top of the main heap. class PreparedHeap { + // The mutex is required for push and pop from PreparedHeap. ::erase will + // use external synchronization via prepared_mutex_. + port::Mutex push_pop_mutex_; + // TODO(myabandeh): replace it with deque std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> heap_; std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> erased_heap_; + std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber}; // True when testing crash recovery bool TEST_CRASH_ = false; friend class WritePreparedTxnDB; @@ -521,10 +528,19 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { assert(erased_heap_.empty()); } } - bool empty() { return heap_.empty(); } - uint64_t top() { return heap_.top(); } - void push(uint64_t v) { heap_.push(v); } - void pop() { + port::Mutex* push_pop_mutex() { return &push_pop_mutex_; } + + inline bool empty() { return top() == kMaxSequenceNumber; } + // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
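+ // The current top is mirrored in the atomic heap_top_ (updated by push/pop + // while push_pop_mutex_ is held), so top() can be read without the mutex.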
+ inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); } + inline void push(uint64_t v) { + heap_.push(v); + heap_top_.store(heap_.top(), std::memory_order_release); + } + void pop(bool locked = false) { + if (!locked) { + push_pop_mutex()->Lock(); + } heap_.pop(); while (!heap_.empty() && !erased_heap_.empty() && // heap_.top() > erased_heap_.top() could happen if we have erased @@ -543,15 +559,23 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { while (heap_.empty() && !erased_heap_.empty()) { erased_heap_.pop(); } + heap_top_.store(!heap_.empty() ? heap_.top() : kMaxSequenceNumber, + std::memory_order_release); + if (!locked) { + push_pop_mutex()->Unlock(); + } } + // Concurrent calls need external synchronization. It is safe to call this + // concurrently with push and pop, though. void erase(uint64_t seq) { if (!heap_.empty()) { + auto top_seq = top(); - if (seq < heap_.top()) { + if (seq < top_seq) { // Already popped, ignore it. - } else if (heap_.top() == seq) { + } else if (top_seq == seq) { pop(); assert(heap_.empty() || heap_.top() != seq); - } else { // (heap_.top() > seq) + } else { // top() > seq // Down the heap, remember to pop it later erased_heap_.push(seq); } } @@ -596,27 +620,37 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // written in two steps, we also update prepared_txns_ at the first step // (via the same mechanism) so that their uncommitted data is reflected in // SmallestUnCommittedSeq. - ReadLock rl(&prepared_mutex_); - // Since we are holding the mutex, and GetLatestSequenceNumber is updated - // after prepared_txns_ are, the value of GetLatestSequenceNumber would - // reflect any uncommitted data that is not added to prepared_txns_ yet. - // Otherwise, if there is no concurrent txn, this value simply reflects that - // latest value in the memtable. - if (!delayed_prepared_.empty()) { - assert(!delayed_prepared_empty_.load()); - return *delayed_prepared_.begin(); + if (!delayed_prepared_empty_.load()) { + ReadLock rl(&prepared_mutex_); + if (!delayed_prepared_.empty()) { + return *delayed_prepared_.begin(); + } } - if (prepared_txns_.empty()) { - return db_impl_->GetLatestSequenceNumber() + 1; + // This must be called before calling ::top. This is because the concurrent + // thread would call ::RemovePrepared before updating + // GetLatestSequenceNumber(). Reading them in the opposite order here + // guarantees that the ::top we read would be lower than the ::top if we + // had otherwise updated/read them atomically. + auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1; + auto min_prepare = prepared_txns_.top(); + bool empty = min_prepare == kMaxSequenceNumber; + if (empty) { + // Since GetLatestSequenceNumber is updated + // after prepared_txns_ are, the value of GetLatestSequenceNumber would + // reflect any uncommitted data that is not added to prepared_txns_ yet. + // Otherwise, if there is no concurrent txn, this value simply reflects + // the latest value in the memtable.
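+ // (Illustrative example: with no prepared txns and + // GetLatestSequenceNumber() == 42, next_prepare == 43, so + // SmallestUnCommittedSeq() returns 43.)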
+ return next_prepare; } else { - return std::min(prepared_txns_.top(), - db_impl_->GetLatestSequenceNumber() + 1); + return std::min(min_prepare, next_prepare); } } + // Enhance the snapshot object by recording in it the smallest uncommitted seq inline void EnhanceSnapshot(SnapshotImpl* snapshot, SequenceNumber min_uncommitted) { assert(snapshot); + assert(min_uncommitted <= snapshot->number_ + 1); snapshot->min_uncommitted_ = min_uncommitted; } @@ -778,12 +812,28 @@ class AddPreparedCallback : public PreReleaseCallback { } virtual Status Callback(SequenceNumber prepare_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t log_number) override { + uint64_t log_number, size_t index, + size_t total) override { + assert(index < total); + // To reduce the cost of lock acquisition competing with the concurrent + // prepare requests, lock on the first callback and unlock on the last. + const bool do_lock = !two_write_queues_ || index == 0; + const bool do_unlock = !two_write_queues_ || index + 1 == total; // Always Prepare from the main queue assert(!two_write_queues_ || !is_mem_disabled); // implies the 1st queue + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume"); + if (do_lock) { + db_->prepared_txns_.push_pop_mutex()->Lock(); + } + const bool kLocked = true; for (size_t i = 0; i < sub_batch_cnt_; i++) { - db_->AddPrepared(prepare_seq + i); + db_->AddPrepared(prepare_seq + i, kLocked); } + if (do_unlock) { + db_->prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end"); if (first_prepare_batch_) { assert(log_number != 0); db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection( @@ -826,7 +876,8 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { // Always commit from the 2nd queue assert(!db_impl_->immutable_db_options().two_write_queues || is_mem_disabled); @@ -863,6 +914,14 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { // publish sequence numbers will be in order, i.e., once a seq is // published all the seq prior to that are also publishable. db_impl_->SetLastPublishedSequence(last_commit_seq); + // Note RemovePrepared should be called after publishing the seq. + // Otherwise SmallestUnCommittedSeq optimization breaks. 
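+ // (Until this commit is published, the entry in prepared_txns_ is what + // keeps SmallestUnCommittedSeq() at or below this txn's prepare seq; + // removing it before the publish would let a concurrent reader compute a + // value larger than the seq of a commit that is not yet visible.)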
+ if (prep_seq_ != kMaxSequenceNumber) { + db_->RemovePrepared(prep_seq_, prep_batch_cnt_); + } // else there was no prepare phase + if (includes_aux_batch_) { + db_->RemovePrepared(aux_seq_, aux_batch_cnt_); + } } // else SequenceNumber that is updated as part of the write already does the // publishing @@ -907,8 +966,8 @@ class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback { assert(prep_batch_cnt_ > 0); } - Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, - uint64_t) override { + Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t, + size_t /*index*/, size_t /*total*/) override { // Always commit from the 2nd queue assert(is_mem_disabled); // implies the 2nd queue assert(db_impl_->immutable_db_options().two_write_queues); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 73e9a8837a0..a1fe213ddd3 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -319,8 +319,8 @@ Status WriteUnpreparedTxn::CommitInternal() { explicit PublishSeqPreReleaseCallback(DBImpl* db_impl) : db_impl_(db_impl) {} Status Callback(SequenceNumber seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(is_mem_disabled); assert(db_impl_->immutable_db_options().two_write_queues); db_impl_->SetLastPublishedSequence(seq); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index ea655f88e3c..0c94183947f 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -185,8 +185,8 @@ Status WriteUnpreparedTxnDB::Initialize( explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) : db_(db) {} Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(!is_mem_disabled); db_->AddCommitted(commit_seq, commit_seq); return Status::OK(); diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index fab8ce8263d..6405ba68381 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -57,7 +57,8 @@ class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) ? 
commit_seq : commit_seq + data_batch_cnt_ - 1; @@ -121,7 +122,8 @@ class WriteUnpreparedRollbackPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { assert(is_mem_disabled); // implies the 2nd queue const uint64_t last_commit_seq = commit_seq; db_->AddCommitted(rollback_seq_, last_commit_seq); From 6ce5580882bda5791bec61b033e03a452a7a8483 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 10 Jun 2019 12:53:56 -0700 Subject: [PATCH 121/572] Improve memtable earliest seqno assignment for secondary instance (#5413) Summary: In a regular RocksDB instance, `MemTable::earliest_seqno_` is "db sequence number at the time of creation". However, we cannot use the db sequence number to set the value of `MemTable::earliest_seqno_` for a secondary instance, i.e. `DBImplSecondary`, due to the logic of MANIFEST and WAL replay. When replaying the log files of the primary, the secondary instance first replays MANIFEST and updates the db sequence number if necessary. Next, the secondary replays WAL files, creates new memtables if necessary and inserts key-value pairs into memtables. The following can occur when the db has two or more column families. Assume the db has column families "default" and "cf1". At a certain point in time, both "default" and "cf1" have data in memtables. 1. Primary triggers a flush and flushes "cf1". "default" is **not** flushed. 2. Secondary replays the MANIFEST and updates its db sequence number to the latest value learned from the MANIFEST. 3. Secondary starts to replay the WAL that contains the writes to "default". It is possible that the write batches' sequence numbers are smaller than the db sequence number. In this case, these write batches will be skipped, and these updates will not be visible to readers until "default" is later flushed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5413 Differential Revision: D15637407 Pulled By: riversand963 fbshipit-source-id: 3de3fe35cfc6f1b9f844f3f926f0df29717b6580 --- HISTORY.md | 1 + db/db_impl/db_impl_secondary.cc | 36 ++++++++++++++++++++++++--------- db/db_impl/db_secondary_test.cc | 7 +++++++ 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c88b436e40d..ad6c370b5a0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -27,6 +27,7 @@ ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. +* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family.
## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 827d99929a9..eb8c4c98738 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -199,16 +199,8 @@ Status DBImplSecondary::RecoverLogFiles( record.size(), Status::Corruption("log record too small")); continue; } - SequenceNumber seq = versions_->LastSequence(); WriteBatchInternal::SetContents(&batch, record); SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); - // If the write batch's sequence number is smaller than the last sequence - // number of the db, then we should skip this write batch because its - // data must reside in an SST that has already been added in the prior - // MANIFEST replay. - if (seq_of_batch < seq) { - continue; - } std::vector<uint32_t> column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); if (status.ok()) { @@ -221,6 +213,17 @@ Status DBImplSecondary::RecoverLogFiles( if (cfds_changed->count(cfd) == 0) { cfds_changed->insert(cfd); } + const std::vector<FileMetaData*>& l0_files = + cfd->current()->storage_info()->LevelFiles(0); + SequenceNumber seq = + l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno; + // If the write batch's sequence number is no larger than the largest + // sequence number persisted for this column family, then its data must + // reside in an SST that has already been added in the prior MANIFEST + // replay. + if (seq_of_batch <= seq) { + continue; + } auto curr_log_num = port::kMaxUint64; if (cfd_to_current_log_.count(cfd) > 0) { curr_log_num = cfd_to_current_log_[cfd]; } @@ -233,7 +236,7 @@ Status DBImplSecondary::RecoverLogFiles( const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); MemTable* new_mem = - cfd->ConstructNewMemtable(mutable_cf_options, seq); + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); cfd->mem()->SetNextLogNumber(log_number); cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); new_mem->Ref(); @@ -452,6 +455,21 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast<ReactiveVersionSet*>(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast<uint64_t>(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + // list wal_dir to discover new WALs and apply new changes to the secondary // instance if (s.ok()) { diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index 5b375422f02..c9aaa361191 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -576,6 +576,11 @@ TEST_F(DBSecondaryTest, SwitchWAL) { TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); const std::string kCFName1 = "pikachu"; Options options; options.env =
env_; @@ -629,8 +634,10 @@ TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); ASSERT_OK( Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT("DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); } } From 63ace8ef0e644ab3384b0a19f0235cd6596f70c1 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 10 Jun 2019 13:28:18 -0700 Subject: [PATCH 122/572] Reuse data block iterator in BlockBasedTableReader::MultiGet() (#5314) Summary: Instead of creating a new DataBlockIterator for every key in a MultiGet batch, reuse it if the next key is in the same block. This results in a small 1-2% cpu improvement. TEST_TMPDIR=/dev/shm/multiget numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Without the change - multireadrandom : 3.066 micros/op 326122 ops/sec; (29375968 of 29375968 found) With the change - multireadrandom : 3.003 micros/op 332945 ops/sec; (29983968 of 29983968 found) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5314 Differential Revision: D15742108 Pulled By: anand1976 fbshipit-source-id: 220fb0b8eea9a0d602ddeb371528f7af7936d771 --- table/block_based/block.h | 13 ++++++ table/block_based/block_based_table_reader.cc | 43 ++++++++++++++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/table/block_based/block.h b/table/block_based/block.h index 3c54389b08a..2bb577d33bd 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -236,6 +236,7 @@ class BlockIter : public InternalIteratorBase { restart_index_ = num_restarts_; global_seqno_ = global_seqno; block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; } // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do @@ -285,6 +286,10 @@ class BlockIter : public InternalIteratorBase { return static_cast(value_.data() - data_); } + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + protected: // Note: The type could be changed to InternalKeyComparator but we see a weird // performance drop by that. @@ -307,6 +312,14 @@ class BlockIter : public InternalIteratorBase { bool block_contents_pinned_; SequenceNumber global_seqno_; + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + public: // Return the offset in data_ just past the end of the current entry. 
inline uint32_t NextEntryOffset() const { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 68213f04149..a8e4e1d40db 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -129,6 +129,14 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, false /* force_erase */); +} + // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. This flag will be used // as total_order_seek via NewIndexIterator @@ -2073,6 +2081,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( cache_handle); } } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); } block.TransferTo(iter); @@ -2933,6 +2943,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, iiter_unique_ptr.reset(iiter); } + DataBlockIter biter; + uint64_t offset = std::numeric_limits<uint64_t>::max(); for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); ++miter) { Status s; @@ -2941,10 +2953,15 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - DataBlockIter biter; - NewDataBlockIterator<DataBlockIter>( - read_options, iiter->value(), &biter, BlockType::kData, - true /* key_includes_seq */, get_context); + bool reusing_block = true; + if (iiter->value().offset() != offset) { + offset = iiter->value().offset(); + biter.Invalidate(Status::OK()); + NewDataBlockIterator<DataBlockIter>( + read_options, iiter->value(), &biter, BlockType::kData, false, + true /* key_includes_seq */, get_context); + reusing_block = false; + } if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { @@ -2971,13 +2988,27 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter.key(), &parsed_key)) { s = Status::Corruption(Slice()); } + if (biter.IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter.cache_handle() != nullptr); + block_cache->Ref(biter.cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter.cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = &biter; + } + } if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { + parsed_key, biter.value(), &matched, value_pinner)) { done = true; break; } From 5efa0d6b0df1f3aea2ea8720c48c2b918b47ead1 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 10 Jun 2019 15:30:05 -0700 Subject: [PATCH 123/572] Create a BlockCacheLookupContext to enable fine-grained block cache tracing. (#5421) Summary: BlockCacheLookupContext only contains the caller for now. We will trace block accesses at five places: 1. BlockBasedTable::GetFilter. 2. BlockBasedTable::GetUncompressedDict. 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, and range deletion block.) 4.
BlockBasedTable::Get. (To trace the referenced key and whether the referenced key exists in a fetched data block.) 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the referenced key exists in a fetched data block.) We create the context at: 1. BlockBasedTable::Get. (kUserGet) 2. BlockBasedTable::MultiGet. (kUserMGet) 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or external SST ingestion calls this function.) 4. BlockBasedTable::Open. (kPrefetch) 5. Index/Filter::CacheDependencies. (kPrefetch) 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or kUserApproximateSize). I loaded 1 million key-value pairs into the database and ran the readrandom benchmark with a single thread. I gave the block cache 10 GB to make sure all reads hit the block cache after warmup. The throughput is comparable. Throughput of this PR: 231334 ops/s. Throughput of the master branch: 238428 ops/s. Experiment setup: RocksDB: version 6.2 Date: Mon Jun 10 10:42:51 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 1000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 114.4 MB (estimated) FileSize: 114.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 Load command: ./db_bench --benchmarks="fillseq" --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 Run command: ./db_bench --benchmarks="readrandom,stats" --use_existing_db --threads=1 --duration=120 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 --duration=120 TODOs: 1. Create a caller for external SST file ingestion and differentiate the callers for iterator. 2. Integrate tracer to trace block cache accesses. 
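To make the new plumbing concrete, here is a minimal sketch of the context type and of how call sites thread it through the reader. The real declaration lives in trace_replay/block_cache_tracer.h; the exact enum shape and member layout below are assumptions inferred from the call sites in this diff, not the verbatim header.

// Which code path initiated the block cache lookup (sketch only).
enum class BlockCacheLookupCaller : char {
  kUserGet,
  kUserMGet,
  kUserIterator,
  kPrefetch,
  kCompaction,
  kUserApproximateSize,
};

// For now the context records only the caller; later fields (e.g. the
// referenced key) can be added without changing any of the plumbing.
struct BlockCacheLookupContext {
  explicit BlockCacheLookupContext(BlockCacheLookupCaller c) : caller(c) {}
  BlockCacheLookupCaller caller;
};

Each top-level operation constructs one context and passes its address down the call chain, e.g. a point read would do BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; and then hand &lookup_context to NewDataBlockIterator and friends, as the hunks below show.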
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5421 Differential Revision: D15704258 Pulled By: HaoyuHuang fbshipit-source-id: 4aa8a55f8cb1576ffb367bfa3186a91d8f06d93a --- db/compaction/compaction_job.cc | 3 +- db/db_impl/db_impl.cc | 4 +- db/version_set.cc | 21 +- db/version_set.h | 7 +- table/block_based/block_based_filter_block.cc | 6 +- table/block_based/block_based_filter_block.h | 23 +- .../block_based_filter_block_test.cc | 200 +++++++++--- table/block_based/block_based_table_reader.cc | 300 +++++++++++------- table/block_based/block_based_table_reader.h | 89 +++--- table/block_based/filter_block.h | 34 +- table/block_based/full_filter_block.cc | 23 +- table/block_based/full_filter_block.h | 59 ++-- table/block_based/full_filter_block_test.cc | 64 +++- table/block_based/partitioned_filter_block.cc | 37 ++- table/block_based/partitioned_filter_block.h | 30 +- .../partitioned_filter_block_test.cc | 18 +- table/cuckoo/cuckoo_table_reader.h | 5 +- table/mock_table.h | 7 +- table/plain/plain_table_reader.cc | 3 +- table/plain/plain_table_reader.h | 3 +- table/table_reader.h | 3 +- trace_replay/block_cache_tracer.h | 30 +- 22 files changed, 634 insertions(+), 335 deletions(-) diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index ca8575a0dc9..65efedad5b4 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1); + uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, + /*for_compaction*/ true); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index bb6ec7db4c5..b1a828f9f0e 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2717,7 +2717,9 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { - sizes[i] += versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); + sizes[i] += versions_->ApproximateSize( + v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, + /*for_compaction=*/false); } if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/version_set.cc b/db/version_set.cc index 96bf22e57b4..8895879bfbf 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4827,7 +4827,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // maintain state of where they first appear in the files. 
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, - int end_level) { + int end_level, bool for_compaction) { // pre-condition assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); @@ -4848,7 +4848,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (!level) { // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end); + size += ApproximateSizeLevel0(v, files_brief, start, end, for_compaction); continue; } @@ -4865,7 +4865,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, // inferred from the sorted order for (uint64_t i = idx_start; i < files_brief.num_files; i++) { uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end); + val = ApproximateSize(v, files_brief.files[i], end, for_compaction); if (!val) { // the files after this will not have the range break; @@ -4876,7 +4876,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (i == idx_start) { // subtract the bytes needed to be scanned to get to the starting // key - val = ApproximateSize(v, files_brief.files[i], start); + val = ApproximateSize(v, files_brief.files[i], start, for_compaction); assert(size >= val); size -= val; } @@ -4889,13 +4889,16 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, uint64_t VersionSet::ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& key_start, - const Slice& key_end) { + const Slice& key_end, + bool for_compaction) { // level 0 files are not in sorted order, we need to iterate through // the list to compute the total bytes that require scanning uint64_t size = 0; for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start); - const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end); + const uint64_t start = + ApproximateSize(v, files_brief.files[i], key_start, for_compaction); + const uint64_t end = + ApproximateSize(v, files_brief.files[i], key_end, for_compaction); assert(end >= start); size += end - start; } @@ -4903,7 +4906,7 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key) { + const Slice& key, bool for_compaction) { // pre-condition assert(v); @@ -4923,7 +4926,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, *f.file_metadata, nullptr /* range_del_agg */, v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key); + result = table_reader_ptr->ApproximateOffsetOf(key, for_compaction); } delete iter; } diff --git a/db/version_set.h b/db/version_set.h index dc9e759655e..8a43b982366 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -982,7 +982,7 @@ class VersionSet { // in levels [start_level, end_level). 
If end_level == 0 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level = 0, int end_level = -1); + int start_level, int end_level, bool for_compaction); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1032,10 +1032,11 @@ class VersionSet { // ApproximateSize helper uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end); + const Slice& start, const Slice& end, + bool for_compaction); uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key); + const Slice& key, bool for_compaction); // Save current contents to *log Status WriteSnapshot(log::Writer* log); diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index fb366b5d316..e5a32e4635f 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -187,7 +187,8 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( bool BlockBasedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; @@ -198,7 +199,8 @@ bool BlockBasedFilterBlockReader::KeyMayMatch( bool BlockBasedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { assert(block_offset != kNotValid); return MayMatch(prefix, block_offset); } diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 74a2285e1e9..cd86ff5c8a5 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -82,17 +82,18 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { const BlockBasedTableOptions& table_opt, bool whole_key_filtering, BlockContents&& contents, Statistics* statistics); - virtual bool IsBlockBased() override { return true; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; // convert this object to a human readable form std::string ToString() const override; diff --git 
a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index e0ca57f1c51..220888dd2fb 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -57,8 +57,12 @@ TEST_F(FilterBlockTest, EmptyBuilder) { ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FilterBlockTest, SingleChunk) { @@ -76,13 +80,27 @@ TEST_F(FilterBlockTest, SingleChunk) { BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FilterBlockTest, MultiChunk) { @@ -110,28 +128,60 @@ TEST_F(FilterBlockTest, MultiChunk) { std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } // Test for block based filter block @@ -154,8 +204,12 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); + ASSERT_TRUE(reader->KeyMayMatch( + 
"foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; @@ -175,13 +229,27 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; @@ -213,28 +281,60 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { nullptr, table_options_, true, std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 
3100)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a8e4e1d40db..d1beafed68b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -178,6 +178,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block); const BlockBasedTable* table() const { return table_; } @@ -211,6 +212,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { Status GetOrReadIndexBlock(const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const; size_t ApproximateIndexBlockMemoryUsage() const { @@ -228,6 +230,7 @@ class BlockBasedTable::IndexReaderCommon : public 
BlockBasedTable::IndexReader { Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) { PERF_TIMER_GUARD(read_index_block_nanos); @@ -241,13 +244,14 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context); + get_context, lookup_context); return s; } Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const { assert(index_block != nullptr); @@ -256,8 +260,8 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( return Status::OK(); } - return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, read_options, - get_context, index_block); + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + get_context, lookup_context, index_block); } // Index that allows binary search lookup in a two-level index structure. @@ -269,7 +273,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -277,8 +282,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -296,10 +302,11 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // return a two-level iterator: first level is on the partition index InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -352,6 +359,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; @@ -359,7 +367,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, - &index_block); + &lookup_context, &index_block); if (!s.ok()) { ROCKS_LOG_WARN(rep->ioptions.info_log, "Error retrieving top-level index block while trying to " @@ 
-408,7 +416,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, nullptr /* get_context */); + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -451,7 +459,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -459,8 +468,9 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -477,10 +487,11 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -526,7 +537,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(index_reader != nullptr); assert(!pin || prefetch); @@ -536,8 +548,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -616,10 +629,11 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -1055,6 +1069,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Better 
not mutate rep_ after the creation. eg. internal_prefix_transform // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, internal_comparator, skip_filters, level, immortal_table); @@ -1095,13 +1110,13 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, return s; } s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(), - internal_comparator); + internal_comparator, &lookup_context); if (!s.ok()) { return s; } s = new_table->PrefetchIndexAndFilterBlocks( prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all, - table_options, level); + table_options, level, &lookup_context); if (s.ok()) { // Update tail prefetch stats @@ -1304,7 +1319,8 @@ Status BlockBasedTable::ReadPropertiesBlock( Status BlockBasedTable::ReadRangeDelBlock( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator) { + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { Status s; bool found_range_del_block; BlockHandle range_del_handle; @@ -1317,10 +1333,10 @@ Status BlockBasedTable::ReadRangeDelBlock( } else if (found_range_del_block && !range_del_handle.IsNull()) { ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( - read_options, range_del_handle, nullptr /* input_iter */, - BlockType::kRangeDeletion, true /* key_includes_seq */, - true /* index_key_is_full */, nullptr /* get_context */, Status(), - prefetch_buffer)); + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); assert(iter != nullptr); s = iter->status(); if (!s.ok()) { @@ -1370,7 +1386,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( Status BlockBasedTable::PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level) { + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { Status s; // Find filter handle and filter type @@ -1440,7 +1457,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( IndexReader* index_reader = nullptr; if (s.ok()) { s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, - prefetch_index, pin_index, &index_reader); + prefetch_index, pin_index, &index_reader, + lookup_context); if (s.ok()) { assert(index_reader != nullptr); rep_->index_reader.reset(index_reader); @@ -1467,7 +1485,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = - new_table->GetFilter(rep_->table_prefix_extractor.get()); + new_table->GetFilter(rep_->table_prefix_extractor.get(), + /*prefetch_buffer=*/nullptr, /*no_io=*/false, + /*get_context=*/nullptr, lookup_context); if (filter_entry.GetValue() != nullptr && prefetch_all) { filter_entry.GetValue()->CacheDependencies( pin_all, rep_->table_prefix_extractor.get()); @@ -1653,8 +1673,7 @@ Status BlockBasedTable::GetDataBlockFromCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = 
block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, - &cache_handle); + &DeleteCachedEntry, &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1758,8 +1777,7 @@ Status BlockBasedTable::PutDataBlockToCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, - &cache_handle, priority); + &DeleteCachedEntry, &cache_handle, priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1849,25 +1867,28 @@ FilterBlockReader* BlockBasedTable::ReadFilter( CachableEntry BlockBasedTable::GetFilter( const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) const { + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; const bool is_a_filter_partition = true; return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, prefix_extractor); + no_io, get_context, lookup_context, prefix_extractor); } CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/, const SliceTransform* prefix_extractor) const { + // TODO(haoyu): Trace filter block access here. // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will // most probably fail again. if (!is_a_filter_partition && !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr, + /*own_value=*/false}; } Cache* block_cache = rep_->table_options.block_cache.get(); @@ -1877,8 +1898,8 @@ CachableEntry BlockBasedTable::GetFilter( } if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { - return {rep_->filter_entry.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->filter_entry.GetValue(), /*cache=*/nullptr, + /*cache_handle=*/nullptr, /*own_value=*/false}; } PERF_TIMER_GUARD(read_filter_block_nanos); @@ -1920,12 +1941,13 @@ CachableEntry BlockBasedTable::GetFilter( } return {filter, cache_handle ? block_cache : nullptr, cache_handle, - false /* own_value */}; + /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context) const { + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/) const { + // TODO(haoyu): Trace the access on the uncompression dictionary here. if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. 
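The brace initializers used in GetFilter above, {value, cache, cache_handle, own_value}, construct CachableEntry objects, and they encode three distinct ownership modes. A minimal stand-in that makes those modes explicit (SimpleCachableEntry is an illustrative name, not RocksDB API; the real template also releases its Cache::Handle on destruction):

// Simplified, self-contained stand-in for CachableEntry<T>; sketch only.
template <class T>
class SimpleCachableEntry {
 public:
  SimpleCachableEntry(T* value, void* cache, void* cache_handle,
                      bool own_value)
      : value_(value), cache_(cache), cache_handle_(cache_handle),
        own_value_(own_value) {}

  ~SimpleCachableEntry() {
    // Mode 1: cache_handle_ != nullptr -- the block cache owns the value;
    //         the real class releases the handle here (omitted).
    // Mode 2: own_value_ == true -- the entry owns the value outright.
    // Mode 3: neither -- a plain non-owning reference, which is what
    //         {rep_->filter.get(), /*cache=*/nullptr,
    //          /*cache_handle=*/nullptr, /*own_value=*/false} hands back
    //         when the table object itself owns the filter.
    if (own_value_) {
      delete value_;
    }
  }

  T* GetValue() const { return value_; }
  bool IsCached() const { return cache_handle_ != nullptr; }

 private:
  T* value_;
  void* cache_;
  void* cache_handle_;
  bool own_value_;
};

Under this reading, the IsCached() checks in the surrounding code simply ask whether a cache handle will eventually have to be released.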
@@ -1987,14 +2009,16 @@ CachableEntry BlockBasedTable::GetUncompressionDict( // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, GetContext* get_context) const { + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { assert(rep_ != nullptr); assert(rep_->index_reader != nullptr); // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, - input_iter, get_context); + input_iter, get_context, + lookup_context); } // Convert an index iterator value (i.e., an encoded BlockHandle) @@ -2005,7 +2029,7 @@ template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, bool index_key_is_full, - GetContext* get_context, Status s, + GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -2017,7 +2041,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool no_io = (ro.read_tier == kBlockCacheTier); auto uncompression_dict_storage = - GetUncompressionDict(prefetch_buffer, no_io, get_context); + GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() @@ -2025,7 +2049,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - block_type, get_context); + block_type, get_context, lookup_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -2093,7 +2117,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context) const { + GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/) const { + // TODO(haoyu): Trace data/index/range deletion block access here. 
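  // Illustrative sketch only, not part of the patch: once the TODO above is
  // resolved, the state in scope here is what a trace record would capture.
  // The record type and sink below are hypothetical:
  //
  //   BlockCacheTraceRecord record;            // hypothetical type
  //   record.caller = lookup_context->caller;  // kUserGet, kCompaction, ...
  //   record.block_type = block_type;          // data / index / range del
  //   record.is_cache_hit = block_entry->GetValue() != nullptr;
  //   tracer_->WriteBlockAccess(record);       // hypothetical sink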
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
@@ -2169,7 +2195,7 @@ Status BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry<Block>* block_entry, BlockType block_type,
-    GetContext* get_context) const {
+    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
   assert(block_entry);
   assert(block_entry->IsEmpty());
@@ -2180,7 +2206,7 @@ Status BlockBasedTable::RetrieveBlock(
       block_type != BlockType::kIndex)) {
     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                      uncompression_dict, block_entry,
-                                     block_type, get_context);
+                                     block_type, get_context, lookup_context);
 
     if (!s.ok()) {
       return s;
@@ -2271,7 +2297,8 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
 bool BlockBasedTable::PrefixMayMatch(
     const Slice& internal_key, const ReadOptions& read_options,
     const SliceTransform* options_prefix_extractor,
-    const bool need_upper_bound_check) const {
+    const bool need_upper_bound_check,
+    BlockCacheLookupContext* lookup_context) const {
   if (!rep_->filter_policy) {
     return true;
   }
@@ -2295,7 +2322,9 @@ bool BlockBasedTable::PrefixMayMatch(
   Status s;
 
   // First, try check with full filter
-  auto filter_entry = GetFilter(prefix_extractor);
+  auto filter_entry =
+      GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false,
+                /*get_context=*/nullptr, lookup_context);
   FilterBlockReader* filter = filter_entry.GetValue();
   bool filter_checked = true;
   if (filter != nullptr) {
@@ -2304,7 +2333,7 @@ bool BlockBasedTable::PrefixMayMatch(
       may_match = filter->RangeMayExist(
           read_options.iterate_upper_bound, user_key, prefix_extractor,
           rep_->internal_comparator.user_comparator(), const_ikey_ptr,
-          &filter_checked, need_upper_bound_check);
+          &filter_checked, need_upper_bound_check, lookup_context);
     } else {
       // if prefix_extractor changed for block based filter, skip filter
       if (need_upper_bound_check) {
@@ -2323,9 +2352,10 @@ bool BlockBasedTable::PrefixMayMatch(
     // Then, try find it within each block
     // we already know prefix_extractor and prefix_extractor_name must match
     // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
-    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
-        NewIndexIterator(no_io_read_options,
-                         /* need_upper_bound_check */ false));
+    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
+        no_io_read_options,
+        /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+        /*get_context=*/nullptr, lookup_context));
     iiter->Seek(internal_prefix);
 
     if (!iiter->Valid()) {
@@ -2357,8 +2387,9 @@ bool BlockBasedTable::PrefixMayMatch(
         // possibly contain the key. Thus, the corresponding data block
         // is the only one that could potentially contain the prefix.
BlockHandle handle = iiter->value(); - may_match = - filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); + may_match = filter->PrefixMayMatch( + prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + /*const_key_ptr=*/nullptr, lookup_context); } } } @@ -2588,7 +2619,7 @@ void BlockBasedTableIterator::InitDataBlock() { table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, - /* get_context */ nullptr, s, prefetch_buffer_.get()); + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; if (read_options_.iterate_upper_bound != nullptr) { data_block_within_upper_bound_ = @@ -2682,6 +2713,9 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { + BlockCacheLookupContext lookup_context{ + for_compaction ? BlockCacheLookupCaller::kCompaction + : BlockCacheLookupCaller::kUserIterator}; bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); if (arena == nullptr) { @@ -2690,7 +2724,8 @@ InternalIterator* BlockBasedTable::NewIterator( NewIndexIterator( read_options, need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch), + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, @@ -2700,7 +2735,9 @@ InternalIterator* BlockBasedTable::NewIterator( arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, need_upper_bound_check), + NewIndexIterator(read_options, need_upper_bound_check, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, @@ -2724,7 +2761,8 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( bool BlockBasedTable::FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor) const { + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } @@ -2735,15 +2773,16 @@ bool BlockBasedTable::FullFilterKeyMayMatch( size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); - may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, - kNotValid, no_io, const_ikey_ptr); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), prefix_extractor, kNotValid, false, - const_ikey_ptr)) { + const_ikey_ptr, lookup_context)) { may_match = false; } if (may_match) { @@ 
-2756,12 +2795,14 @@ bool BlockBasedTable::FullFilterKeyMayMatch( void BlockBasedTable::FullFilterKeysMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor) const { + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return; } if (filter->whole_key_filtering()) { - filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io); + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0) { @@ -2772,7 +2813,8 @@ void BlockBasedTable::FullFilterKeysMayMatch( range->SkipKey(iter); } } - filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false); + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); } } @@ -2786,18 +2828,19 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; { if (!skip_filters) { - filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, get_context); + filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, + read_options.read_tier == kBlockCacheTier, + get_context, &lookup_context); } filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor); + prefix_extractor, &lookup_context); } if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); @@ -2811,8 +2854,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, need_upper_bound_check = PrefixExtractorChanged( rep_->table_properties.get(), prefix_extractor); } - auto iiter = NewIndexIterator(read_options, need_upper_bound_check, - &iiter_on_stack, get_context); + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2828,7 +2872,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), - prefix_extractor, handle.offset(), no_io); + prefix_extractor, handle.offset(), no_io, + /*const_ikey_ptr=*/nullptr, &lookup_context); if (not_exist_in_filter) { // Not found @@ -2841,8 +2886,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, DataBlockIter biter; NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, - true /* key_includes_seq */, true /* index_key_is_full */, - get_context); + /*key_includes_seq=*/true, + /*index_key_is_full=*/true, get_context, &lookup_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { @@ -2907,6 +2953,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { + 
BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; @@ -2915,16 +2962,16 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { if (!skip_filters) { // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, + filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, read_options.read_tier == kBlockCacheTier, - nullptr /*get_context*/); + /*get_context=*/nullptr, &lookup_context); } filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor); + prefix_extractor, &lookup_context); } if (skip_filters || !sst_file_range.empty()) { IndexBlockIter iiter_on_stack; @@ -2937,7 +2984,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - sst_file_range.begin()->get_context); + sst_file_range.begin()->get_context, &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2958,11 +3005,12 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, offset = iiter->value().offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, false, - true /* key_includes_seq */, get_context); + read_options, iiter->value(), &biter, BlockType::kData, + /*key_includes_seq=*/false, + /*index_key_is_full=*/true, get_context, &lookup_context, + Status(), nullptr); reusing_block = false; } - if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache @@ -3040,9 +3088,11 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); } - + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr = @@ -3077,7 +3127,12 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, // Load the block specified by the block_handle into the block cache DataBlockIter biter; - NewDataBlockIterator(ReadOptions(), block_handle, &biter); + + NewDataBlockIterator( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -3089,6 +3144,8 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } Status BlockBasedTable::VerifyChecksum() { + // TODO(haoyu): This function is called by external sst ingestion and the + // verify checksum public API. We don't log its block cache accesses for now. 
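  // Illustrative aside, not part of the patch: tracing VerifyChecksum would
  // need its own caller tag, since none of the existing enum values fits.
  // A hypothetical version (kVerifyChecksum does not exist in this patch):
  //
  //   BlockCacheLookupContext lookup_context{
  //       BlockCacheLookupCaller::kVerifyChecksum};
  //
  // which is presumably why the NewIndexIterator call below passes a null
  // lookup context instead.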
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
@@ -3104,8 +3161,9 @@ Status BlockBasedTable::VerifyChecksum() {
   }
   // Check Data blocks
   IndexBlockIter iiter_on_stack;
-  InternalIteratorBase<BlockHandle>* iiter =
-      NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
+  InternalIteratorBase<BlockHandle>* iiter = NewIndexIterator(
+      ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
   std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
   if (iiter != &iiter_on_stack) {
     iiter_unique_ptr =
@@ -3199,8 +3257,9 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                       const Slice& key) {
-  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
-      NewIndexIterator(options));
+  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
+      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
   iiter->Seek(key);
   assert(iiter->Valid());
@@ -3234,7 +3293,8 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
 Status BlockBasedTable::CreateIndexReader(
     FilePrefetchBuffer* prefetch_buffer,
     InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
-    bool pin, IndexReader** index_reader) {
+    bool pin, IndexReader** index_reader,
+    BlockCacheLookupContext* lookup_context) {
   auto index_type_on_file = rep_->index_type;
 
   // kHashSearch requires non-empty prefix_extractor but bypass checking
@@ -3246,11 +3306,13 @@ Status BlockBasedTable::CreateIndexReader(
   switch (index_type_on_file) {
     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
       return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
-                                          prefetch, pin, index_reader);
+                                          prefetch, pin, index_reader,
+                                          lookup_context);
     }
     case BlockBasedTableOptions::kBinarySearch: {
       return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
-                                             prefetch, pin, index_reader);
+                                             prefetch, pin, index_reader,
+                                             lookup_context);
     }
     case BlockBasedTableOptions::kHashSearch: {
       std::unique_ptr<Block> meta_guard;
@@ -3264,14 +3326,16 @@ Status BlockBasedTable::CreateIndexReader(
         ROCKS_LOG_WARN(rep_->ioptions.info_log,
                        "Unable to read the metaindex block."
                        " Fall back to binary search index.");
-        return BinarySearchIndexReader::Create(
-            this, prefetch_buffer, use_cache, prefetch, pin, index_reader);
+        return BinarySearchIndexReader::Create(this, prefetch_buffer,
+                                               use_cache, prefetch, pin,
+                                               index_reader, lookup_context);
       }
       meta_index_iter = meta_iter_guard.get();
     }
     return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
-                                   use_cache, prefetch, pin, index_reader);
+                                   use_cache, prefetch, pin, index_reader,
+                                   lookup_context);
     }
     default: {
       std::string error_message =
@@ -3281,9 +3345,15 @@
   }
 }
 
-uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
+                                              bool for_compaction) {
+  BlockCacheLookupContext context(
+      for_compaction ? BlockCacheLookupCaller::kCompaction
+                     : BlockCacheLookupCaller::kUserApproximateSize);
   std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context));
   index_iter->Seek(key);
 
   uint64_t result;
@@ -3319,7 +3389,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const {
 Status BlockBasedTable::GetKVPairsFromDataBlocks(
     std::vector<KVPairBlock>* kv_pair_blocks) {
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
 
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
@@ -3337,7 +3409,11 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
     std::unique_ptr<InternalIterator> datablock_iter;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
-        ReadOptions(), blockhandles_iter->value()));
+        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
+        /*type=*/BlockType::kData,
+        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+        /*prefetch_buffer=*/nullptr));
     s = datablock_iter->status();
 
     if (!s.ok()) {
@@ -3545,7 +3621,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
       "Index Details:\n"
       "--------------------------------------\n");
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_file->Append("Can not read Index Block \n\n");
@@ -3594,7 +3672,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
 
 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_file->Append("Can not read Index Block \n\n");
@@ -3628,7 +3708,11 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
     std::unique_ptr<InternalIterator> datablock_iter;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
-        ReadOptions(), blockhandles_iter->value()));
+        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
+        /*type=*/BlockType::kData,
+        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+        /*prefetch_buffer=*/nullptr));
     s = datablock_iter->status();
 
     if (!s.ok()) {
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index d8319a3e711..a92289f9bee 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -113,17 +113,22 @@ class BlockBasedTable : public TableReader {
   bool PrefixMayMatch(const Slice& internal_key,
                       const ReadOptions& read_options,
                       const SliceTransform* options_prefix_extractor,
-                      const bool need_upper_bound_check) const;
+                      const bool need_upper_bound_check,
+                      BlockCacheLookupContext* lookup_context) const;
 
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must
   // call one of the Seek methods on the iterator before using it).
// @param skip_filters Disables loading/accessing the filter block - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, + Arena* arena = nullptr, bool skip_filters = false, + // TODO(haoyu) 1. External SST ingestion sets for_compaction as false. 2. + // Compaction also sets it to false when paranoid_file_checks is true, + // i.e., it will populate the block cache with blocks in the new SST + // files. We treat those as a user is calling iterator for now. We should + // differentiate the callers. + bool for_compaction = false) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -149,7 +154,7 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, bool for_compaction) override; bool TEST_BlockInCache(const BlockHandle& handle) const; @@ -193,7 +198,8 @@ class BlockBasedTable : public TableReader { // returned object. virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context) = 0; + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; // Report an approximation of how much memory has been used other than // memory that was allocated in block cache. @@ -222,10 +228,10 @@ class BlockBasedTable : public TableReader { template TBlockIter* NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& block_handle, - TBlockIter* input_iter = nullptr, BlockType block_type = BlockType::kData, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr) const; + TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, + bool index_key_is_full, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer) const; class PartitionedIndexIteratorState; @@ -262,7 +268,7 @@ class BlockBasedTable : public TableReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context = nullptr) const; + GetContext* get_context, BlockCacheLookupContext* lookup_context) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -271,23 +277,25 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context) const; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. 
CachableEntry GetFilter( - const SliceTransform* prefix_extractor = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, - GetContext* get_context = nullptr) const; + const SliceTransform* prefix_extractor, + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; virtual CachableEntry GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr) const; + BlockCacheLookupContext* lookup_context, + const SliceTransform* prefix_extractor) const; CachableEntry GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context) const; + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Get the iterator from the index reader. // If input_iter is not set, return new Iterator @@ -300,9 +308,9 @@ class BlockBasedTable : public TableReader { // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier InternalIteratorBase* NewIndexIterator( - const ReadOptions& read_options, bool need_upper_bound_check = false, - IndexBlockIter* input_iter = nullptr, - GetContext* get_context = nullptr) const; + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -352,17 +360,20 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, - IndexReader** index_reader); + IndexReader** index_reader, + BlockCacheLookupContext* lookup_context); - bool FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& user_key, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + bool FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; - void FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + void FullFilterKeysMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; static Status PrefetchTail( RandomAccessFileReader* file, uint64_t file_size, @@ -380,14 +391,16 @@ class BlockBasedTable : public TableReader { const SequenceNumber largest_seqno); Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); Status ReadCompressionDictBlock( FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* compression_dict_block) const; Status PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level); + 
const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); @@ -583,7 +596,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction) {} + for_compaction_(for_compaction), + lookup_context_(for_compaction + ? BlockCacheLookupCaller::kCompaction + : BlockCacheLookupCaller::kUserIterator) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -644,7 +660,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey) { if (check_filter_ && !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_)) { + need_upper_bound_check_, &lookup_context_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -702,6 +718,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { // If this iterator is created for compaction bool for_compaction_; BlockHandle prev_index_value_; + BlockCacheLookupContext lookup_context_; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 378cdacfff6..d54de5ae1ab 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -30,6 +30,7 @@ #include "rocksdb/table.h" #include "table/format.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" #include "util/hash.h" namespace rocksdb { @@ -99,18 +100,19 @@ class FilterBlockReader { */ virtual bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey)) { + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + context)) { range->SkipKey(iter); } } @@ -121,19 +123,19 @@ class FilterBlockReader { */ virtual bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; if 
(!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey)) { + block_offset, no_io, &ikey, context)) { range->SkipKey(iter); } } @@ -156,13 +158,13 @@ class FilterBlockReader { virtual bool RangeMayExist( const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, - bool* filter_checked, bool /*need_upper_bound_check*/) { + const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool /*need_upper_bound_check*/, BlockCacheLookupContext* context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, context); } protected: diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 56dc74c6710..6d2b9d70a50 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -124,7 +124,8 @@ FullFilterBlockReader::FullFilterBlockReader( bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)block_offset; #endif @@ -138,7 +139,8 @@ bool FullFilterBlockReader::KeyMayMatch( bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)block_offset; #endif @@ -161,7 +163,8 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool /*no_io*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)range; (void)block_offset; @@ -177,7 +180,8 @@ void FullFilterBlockReader::KeysMayMatch( void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool /*no_io*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)range; (void)block_offset; @@ -224,10 +228,11 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const { return usage; } -bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, - const Slice& user_key, const SliceTransform* prefix_extractor, - const Comparator* comparator, const Slice* const const_ikey_ptr, - bool* filter_checked, bool need_upper_bound_check) { +bool FullFilterBlockReader::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, BlockCacheLookupContext* context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -240,7 +245,7 @@ bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, 
kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, context); } } diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 61df028c920..99e5299b34f 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -95,35 +95,38 @@ class FullFilterBlockReader : public FilterBlockReader { // bits_reader is created in filter_policy, it should be passed in here // directly. and be deleted here - ~FullFilterBlockReader() {} + ~FullFilterBlockReader() override {} + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* context) override; - virtual bool IsBlockBased() override { return false; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual void KeysMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - - virtual void PrefixesMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - virtual size_t ApproximateMemoryUsage() const override; - virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* comparator, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check) override; private: const SliceTransform* prefix_extractor_; Slice contents_; diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 82c43b34ed6..57ff158c5c7 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -112,7 +112,9 @@ TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -127,13 +129,27 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } class FullFilterBlockTest : public testing::Test { @@ -157,7 +173,9 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FullFilterBlockTest, DuplicateEntries) { @@ -207,13 +225,27 @@ TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, 
/*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 7874ce1874f..e80085dfb5b 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -162,8 +162,8 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); if (!whole_key_filtering_) { @@ -177,19 +177,20 @@ bool PartitionedFilterBlockReader::KeyMayMatch( return false; } auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - prefix_extractor); + GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, + prefix_extractor, context); if (UNLIKELY(!filter_partition.GetValue())) { return true; } - return filter_partition.GetValue()->KeyMayMatch(key, prefix_extractor, - block_offset, no_io); + return filter_partition.GetValue()->KeyMayMatch( + key, prefix_extractor, block_offset, no_io, /*const_ikey_ptr=*/nullptr, + context); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) { #ifdef NDEBUG (void)block_offset; #endif @@ -206,13 +207,14 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return false; } auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - prefix_extractor); + GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, + prefix_extractor, context); if (UNLIKELY(!filter_partition.GetValue())) { return true; } - return filter_partition.GetValue()->PrefixMayMatch(prefix, prefix_extractor, - kNotValid, no_io); + return filter_partition.GetValue()->PrefixMayMatch( + prefix, prefix_extractor, kNotValid, no_io, /*const_ikey_ptr=*/nullptr, + context); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( @@ -234,7 +236,8 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( CachableEntry PartitionedFilterBlockReader::GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, const SliceTransform* prefix_extractor) { + const bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* context) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -247,9 
+250,10 @@ PartitionedFilterBlockReader::GetFilterPartition( nullptr /* cache_handle */, false /* own_value */}; } } - return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, + return table_->GetFilter(/*prefetch_buffer=*/nullptr, fltr_blk_handle, is_a_filter_partition, no_io, - /* get_context */ nullptr, prefix_extractor); + /*get_context=*/nullptr, context, + prefix_extractor); } else { auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, is_a_filter_partition, prefix_extractor); @@ -273,6 +277,7 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( @@ -304,7 +309,7 @@ void PartitionedFilterBlockReader::CacheDependencies( const bool is_a_filter_partition = true; auto filter = table_->GetFilter( prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /* get_context */ nullptr, prefix_extractor); + /*get_context=*/nullptr, &lookup_context, prefix_extractor); if (LIKELY(filter.IsCached())) { if (pin) { filter_map_[handle.offset()] = std::move(filter); diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 6860bf82fec..4b0fb523d0d 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -77,26 +77,28 @@ class PartitionedFilterBlockReader : public FilterBlockReader { Statistics* stats, const InternalKeyComparator comparator, const BlockBasedTable* table, const bool index_key_includes_seq, const bool index_value_is_full); - virtual ~PartitionedFilterBlockReader(); + ~PartitionedFilterBlockReader() override; - virtual bool IsBlockBased() override { return false; } - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; private: BlockHandle GetFilterPartitionHandle(const Slice& entry); CachableEntry GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, const SliceTransform* prefix_extractor = nullptr); - virtual void CacheDependencies( - bool bin, const SliceTransform* prefix_extractor) override; + const bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* context); + void CacheDependencies(bool bin, + const SliceTransform* prefix_extractor) override; const SliceTransform* prefix_extractor_; std::unique_ptr 
idx_on_fltr_blk_; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 70e5bbd3bbd..5af7034968a 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -31,6 +31,7 @@ class MockedBlockBasedTable : public BlockBasedTable { CachableEntry GetFilter( FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, const bool /* unused */, bool /* unused */, GetContext* /* unused */, + BlockCacheLookupContext* /*context*/, const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( @@ -168,14 +169,15 @@ class PartitionedFilterBlockTest auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice)); + &ikey_slice, /*context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { @@ -183,11 +185,13 @@ class PartitionedFilterBlockTest const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } else { // assuming a good hash function ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } } } @@ -335,9 +339,9 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), - prefix_extractor.get(), kNotValid, - false /*no_io*/, &ikey_slice)); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*context=*/nullptr)); } } diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index b37d46373e1..0080a76e158 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -56,7 +56,10 @@ class CuckooTableReader: public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/ = false) override { + return 0; + } void SetupForCompaction() override {} // End of methods not implemented. 
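A side note on the `for_compaction` default that these overrides re-declare: in C++, default arguments on virtual functions bind to the static type of the call, so a call made through the `TableReader` interface always picks up the base declaration's default. A minimal sketch of the calling convention, assuming a hypothetical `OpenTableReader()` helper and a `Slice key` in scope:

```cpp
// Calls through the base interface use TableReader's own default
// (for_compaction = false), regardless of the dynamic reader type.
std::unique_ptr<TableReader> reader = OpenTableReader();  // hypothetical helper
uint64_t off_user = reader->ApproximateOffsetOf(key);
uint64_t off_comp = reader->ApproximateOffsetOf(key, /*for_compaction=*/true);
```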
diff --git a/table/mock_table.h b/table/mock_table.h index 42e28266d99..005de1c3dc2 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -50,9 +50,12 @@ class MockTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/ = false) override { + return 0; + } - virtual size_t ApproximateMemoryUsage() const override { return 0; } + size_t ApproximateMemoryUsage() const override { return 0; } void SetupForCompaction() override {} diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 38852059bf9..15f7be1c253 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -613,7 +613,8 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/) { return 0; } diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 6c1c12ab8bb..774e2eb36ef 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -89,7 +89,8 @@ class PlainTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + bool for_compaction = false) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; diff --git a/table/table_reader.h b/table/table_reader.h index 037dbc33818..bf3289818d6 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -61,7 +61,8 @@ class TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + virtual uint64_t ApproximateOffsetOf(const Slice& key, + bool for_compaction = false) = 0; // Set up the table for Compaction. Might change some parameters with // posix_fadvise diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 7b3c82e2b7e..5fd14cbf11b 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -17,12 +17,38 @@ enum BlockCacheLookupCaller : char { kUserGet = 1, kUserMGet = 2, kUserIterator = 3, - kPrefetch = 4, - kCompaction = 5, + kUserApproximateSize = 4, + kPrefetch = 5, + kCompaction = 6, // All callers should be added before kMaxBlockCacheLookupCaller. kMaxBlockCacheLookupCaller }; +// Lookup context for tracing block cache accesses. +// We trace block accesses at five places: +// 1. BlockBasedTable::GetFilter +// 2. BlockBasedTable::GetUncompressedDict. +// 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, +// and range deletion block.) +// 4. BlockBasedTable::Get. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// The context is created at: +// 1. BlockBasedTable::Get. (kUserGet) +// 2. BlockBasedTable::MultiGet. 
(kUserMGet) +// 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or +// external SST ingestion calls this function.) +// 4. BlockBasedTable::Open. (kPrefetch) +// 5. Index/Filter::CacheDependencies. (kPrefetch) +// 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or +// kUserApproximateSize). +struct BlockCacheLookupContext { + BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) + : caller(_caller) {} + const BlockCacheLookupCaller caller; +}; + enum Boolean : char { kTrue = 1, kFalse = 0 }; struct BlockCacheTraceRecord { From 641cc8d541685cad1629bd99bc08ca958458d456 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 10 Jun 2019 15:53:46 -0700 Subject: [PATCH 124/572] Use CreateLoggerFromOptions function (#5427) Summary: Use `CreateLoggerFromOptions` function to reduce code duplication. Test plan (on my machine) ``` $make clean && make -j32 db_secondary_test $KEEP_DB=1 ./db_secondary_test ``` Verify all info logs of the secondary instance are properly logged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5427 Differential Revision: D15748922 Pulled By: riversand963 fbshipit-source-id: bad7261df1b8373efc504f141efc7871e375a311 --- db/db_impl/db_impl_secondary.cc | 35 +++++---------------------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index eb8c4c98738..2737df0ae8c 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -521,39 +521,14 @@ Status DB::OpenAsSecondary( } DBOptions tmp_opts(db_options); + Status s; if (nullptr == tmp_opts.info_log) { - Env* env = tmp_opts.env; - assert(env != nullptr); - std::string secondary_abs_path; - env->GetAbsolutePath(secondary_path, &secondary_abs_path); - std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, - tmp_opts.db_log_dir); - - env->CreateDirIfMissing(secondary_path); - if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { - AutoRollLogger* result = new AutoRollLogger( - env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, - tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); - Status s = result->GetStatus(); - if (!s.ok()) { - delete result; - } else { - tmp_opts.info_log.reset(result); - } - } - if (nullptr == tmp_opts.info_log) { - env->RenameFile( - fname, OldInfoLogFileName(secondary_path, env->NowMicros(), - secondary_abs_path, tmp_opts.db_log_dir)); - Status s = env->NewLogger(fname, &(tmp_opts.info_log)); - if (tmp_opts.info_log != nullptr) { - tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); - } + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; } } - assert(tmp_opts.info_log != nullptr); - handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); impl->versions_.reset(new ReactiveVersionSet( @@ -563,7 +538,7 @@ Status DB::OpenAsSecondary( impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); impl->mutex_.Lock(); - Status s = impl->Recover(column_families, true, false, false); + s = impl->Recover(column_families, true, false, false); if (s.ok()) { for (auto cf : column_families) { auto cfd = From b2584577fa66ccb16c3b67a0347188d2474660ce Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 10 Jun 2019 16:46:04 -0700 Subject: [PATCH 125/572] Remove global locks from FlushScheduler (#5372) Summary: FlushScheduler's methods are 
instrumented with debug-time locks to check the scheduler state against a simple container definition. Since https://github.com/facebook/rocksdb/pull/2286 the scope of such locks was widened to the entire body of the methods. The result is that the concurrency exercised during testing (in debug mode) is stricter than the concurrency level manifested at runtime (in release mode). The patch reverts this change to reduce the scope of such locks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5372

Differential Revision: D15545831

Pulled By: maysamyabandeh

fbshipit-source-id: 01d69191afb1dd807d4bdc990fc74813ae7b5426
---
 db/flush_scheduler.cc | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc
index 8735a6b369b..9c6c04efe33 100644
--- a/db/flush_scheduler.cc
+++ b/db/flush_scheduler.cc
@@ -13,9 +13,11 @@ namespace rocksdb {

 void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
 #ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-  assert(checking_set_.count(cfd) == 0);
-  checking_set_.insert(cfd);
+  {
+    std::lock_guard lock(checking_mutex_);
+    assert(checking_set_.count(cfd) == 0);
+    checking_set_.insert(cfd);
+  }
 #endif  // NDEBUG
   cfd->Ref();
// Suppress false positive clang analyzer warnings.
@@ -32,9 +34,6 @@ void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
 }

 ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
-#ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-#endif  // NDEBUG
   while (true) {
     if (head_.load(std::memory_order_relaxed) == nullptr) {
       return nullptr;
     }
@@ -47,9 +46,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
     delete node;

 #ifndef NDEBUG
-    auto iter = checking_set_.find(cfd);
-    assert(iter != checking_set_.end());
-    checking_set_.erase(iter);
+    {
+      std::lock_guard lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
 #endif  // NDEBUG

     if (!cfd->IsDropped()) {
@@ -65,12 +67,12 @@ }

 bool FlushScheduler::Empty() {
-#ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-#endif  // NDEBUG
   auto rv = head_.load(std::memory_order_relaxed) == nullptr;
 #ifndef NDEBUG
+  std::lock_guard lock(checking_mutex_);
+  // Empty is allowed to be called concurrently with ScheduleFlush. It may
+  // only miss the most recent schedules.
   assert((rv == checking_set_.empty()) || rv);
 #endif  // NDEBUG
   return rv;
 }

From c8c1a549f0cf88fa3d7c82d1f0d96d4b7dcffbf1 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 10 Jun 2019 17:02:23 -0700
Subject: [PATCH 126/572] Avoid deadlock between mutex_ and log_write_mutex_
 (#5437)

Summary:
To avoid deadlock, mutex_ must never be acquired while log_write_mutex_ is already held; when both are needed, the order is mutex_ first, then log_write_mutex_. The patch documents that and also fixes one case in ::FlushWAL that acquires mutex_ through ::WriteStatusCheck while it already holds a lock on log_write_mutex_.
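Schematically, the fix scopes the WAL-writer lock so that it is released before any path that can take mutex_; a condensed sketch of the corrected FlushWAL flow (error handling and the sync path elided):

```cpp
Status s;
{
  // log_write_mutex_ protects logs_; hold it only for the buffer write.
  InstrumentedMutexLock wl(&log_write_mutex_);
  s = logs_.back().writer->WriteBuffer();
}  // log_write_mutex_ released here
if (!s.ok()) {
  // WriteStatusCheck may acquire mutex_; that is safe now that
  // log_write_mutex_ is no longer held.
  WriteStatusCheck(s);
}
```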
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5437

Differential Revision: D15749722

Pulled By: maysamyabandeh

fbshipit-source-id: f57b69c44b4b80cc6d7ddf3d3fdf4a9eb5a5a45a
---
 db/db_impl/db_impl.cc | 11 +++++++----
 db/db_impl/db_impl.h  |  2 ++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index b1a828f9f0e..0a480a4a2eb 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1046,10 +1046,13 @@ int DBImpl::FindMinimumEmptyLevelFitting(
 Status DBImpl::FlushWAL(bool sync) {
   if (manual_wal_flush_) {
-    // We need to lock log_write_mutex_ since logs_ might change concurrently
-    InstrumentedMutexLock wl(&log_write_mutex_);
-    log::Writer* cur_log_writer = logs_.back().writer;
-    auto s = cur_log_writer->WriteBuffer();
+    Status s;
+    {
+      // We need to lock log_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&log_write_mutex_);
+      log::Writer* cur_log_writer = logs_.back().writer;
+      s = cur_log_writer->WriteBuffer();
+    }
     if (!s.ok()) {
       ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
                       s.ToString().c_str());
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 111a91e04f3..4c80b6a4d0c 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1521,6 +1521,8 @@ class DBImpl : public DB {
   // logfile_number_. With two_write_queues it also protects alive_log_files_,
   // and log_empty_. Refer to the definition of each variable below for more
   // details.
+  // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+  // acquired, the order should be first mutex_ and then log_write_mutex_.
   InstrumentedMutex log_write_mutex_;

   std::atomic shutting_down_;

From a94aef6596f876561b28aad7cdcd0c92f04cc1d6 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Mon, 10 Jun 2019 18:43:32 -0700
Subject: [PATCH 127/572] Fix DBTest.DynamicMiscOptions so it passes even with
 Snappy disabled (#5438)

Summary:
This affects our "no compression" automated tests. Since PR #5368, DBTest.DynamicMiscOptions has been failing with:

db/db_test.cc:4889: Failure
dbfull()->SetOptions({{"compression", "kSnappyCompression"}})
Invalid argument: Compression type Snappy is not linked with the binary.
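The fix below swaps the OS_WIN preprocessor guard for a runtime capability check, so the compression-dependent assertions run exactly when the codec is linked in; schematically:

```cpp
// Gate compression-dependent assertions on runtime support rather than
// a platform #ifdef; builds without Snappy simply skip them.
if (Snappy_Supported()) {
  ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
}
```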
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5438 Differential Revision: D15752100 Pulled By: ltamasi fbshipit-source-id: 3f19eff7cafc03b333965be0203c5853d2a9cb71 --- db/db_test.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 27cf790ee57..a27a5eeb97f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4884,14 +4884,15 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); - // Appveyor fails with: Compression type Snappy is not linked with the binary -#ifndef OS_WIN - ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); - ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], - &mutable_cf_options)); - ASSERT_EQ(CompressionType::kSnappyCompression, - mutable_cf_options.compression); -#endif + + if (Snappy_Supported()) { + ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kSnappyCompression, + mutable_cf_options.compression); + } + // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); From 58c4aee42e9ebe008efa2cfdfad107206879446c Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 11 Jun 2019 11:42:19 -0700 Subject: [PATCH 128/572] TransactionUtil::CheckKey() to skip unnecessary history (#4941) Summary: If a memtable definitely covers a key, there isn't a need to check older memtables. We can skip them by checking the earliest sequence number. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4941 Differential Revision: D13932666 fbshipit-source-id: b9d52f234b8ad9dd3bf6547645cd457175a3ca9b --- db/db_impl/db_impl.cc | 22 ++- db/db_impl/db_impl.h | 8 +- utilities/blob_db/blob_db_impl.cc | 4 +- .../optimistic_transaction_test.cc | 118 +++++++++++++++ utilities/transactions/transaction_util.cc | 19 ++- utilities/transactions/transaction_util.h | 11 ++ .../write_prepared_transaction_test.cc | 141 ++++++++++++++++++ 7 files changed, 318 insertions(+), 5 deletions(-) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 0a480a4a2eb..27d48539c35 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3412,7 +3412,9 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #ifndef ROCKSDB_LITE Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index) { Status s; @@ -3445,6 +3447,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, nullptr /*read_callback*/, is_blob_index); @@ -3464,6 +3473,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber 
lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+  if (lower_bound_in_imm != kMaxSequenceNumber &&
+      lower_bound_in_imm < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
  // Check if there is a record for this key in the immutable memtables
  sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, seq, read_options,
@@ -3485,6 +3501,10 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
    return Status::OK();
  }

+  // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+  // check here to skip the history if possible. But currently the caller
+  // already does that. Maybe we should move the logic here later.
+
  // TODO(agiardullo): possible optimization: consider checking cached
  // SST files if cache_only=true?
  if (!cache_only) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 4c80b6a4d0c..4de15f0324d 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -413,11 +413,17 @@ class DBImpl : public DB {
  // snapshot, we know that no key could have existing after this snapshot
  // (since we do not compact keys that have an earlier snapshot).
  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain
+  // data older than `lower_bound_seq`.
+  //
  // Returns OK or NotFound on success,
  // other status on unexpected error.
  // TODO(andrewkr): this API need to be aware of range deletion operations
  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
-                                 bool cache_only, SequenceNumber* seq,
+                                 bool cache_only,
+                                 SequenceNumber lower_bound_seq,
+                                 SequenceNumber* seq,
                                  bool* found_record_for_key,
                                  bool* is_blob_index = nullptr);

diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 25583fa981a..86eb1460c15 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1426,8 +1426,8 @@ class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
    bool found_record_for_key = false;
    bool is_blob_index = false;
    Status s = db_impl->GetLatestSequenceForKey(
-        sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key,
-        &is_blob_index);
+        sv, key_, false /*cache_only*/, 0 /*lower_bound_seq*/, &latest_seq,
+        &found_record_for_key, &is_blob_index);
    db_impl->ReturnAndCleanupSuperVersion(cfd_, sv);
    if (!s.ok() && !s.IsNotFound()) {
      // Error.
diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc
index 5e1af2fb1f5..3aa6c207a48 100644
--- a/utilities/transactions/optimistic_transaction_test.cc
+++ b/utilities/transactions/optimistic_transaction_test.cc
@@ -9,11 +9,15 @@
 #include
 #include
+
+#include "db/db_impl/db_impl.h"
 #include "logging/logging.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "rocksdb/utilities/transaction.h"
+#include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/transaction_test_util.h"
 #include "util/crc32c.h"
@@ -308,6 +312,120 @@ TEST_F(OptimisticTransactionTest, FlushTest2) {
  delete txn;
 }

+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
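+// The test runs two attempts: in the first, the old memtable is flushed and
+// survives only in the flushed-memtable history; in the second, the flush is
+// held back so the memtable is switched to an immutable memtable instead.
+// Both shapes of "old" data must be handled by the lower-bound skipping.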
+TEST_F(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    options.max_write_buffer_number_to_maintain = 3;
+    Reopen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    ReadOptions snapshot_read_options;
+    ReadOptions snapshot_read_options2;
+    string value;
+    Status s;
+
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn != nullptr);
+
+    Transaction* txn2 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn2 != nullptr);
+
+    snapshot_read_options.snapshot = txn->GetSnapshot();
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+    snapshot_read_options2.snapshot = txn2->GetSnapshot();
+    ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
+
+    // txn updates "foo" and txn2 updates "foo2", and now a write is
+    // issued for "foo", which conflicts with txn but not txn2
+    ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold the flush from the beginning. The
+      // memtable will be switched to immutable after calling
+      // TEST_SwitchMemtable() while CheckKey() is called.
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // Force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(txn_db->Flush(flush_ops));
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast(txn_db->GetRootDB());
+      db_impl->TEST_SwitchMemtable();
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                       &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing. When this transaction is committed,
+    // only the active memtable needs to be checked.
+    Transaction* txn3 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn3 != nullptr);
+
+    // Commit both txn and txn2. txn will conflict but txn2 will
+    // pass. Either way, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+
+    get_perf_context()->Reset();
+    s = txn->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    // txn should fail because of the conflict, even though the memtable
+    // has been flushed, because the entry is still preserved in history.
+    ASSERT_TRUE(s.IsBusy());
+
+    get_perf_context()->Reset();
+    s = txn2->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    txn3->Put(Slice("foo2"), Slice("bar2"));
+    get_perf_context()->Reset();
+    s = txn3->Commit();
+    // txn3 is created after the active memtable is created, so that is the only
+    // memtable to check.
+    ASSERT_EQ(1, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable");
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+    SetPerfLevel(PerfLevel::kDisable);
+
+    delete txn;
+    delete txn2;
+    delete txn3;
+  }
+}
+
 TEST_F(OptimisticTransactionTest, NoSnapshotTest) {
  WriteOptions write_options;
  ReadOptions read_options;
diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc
index 407feaaa88a..ba3b75e15bf 100644
--- a/utilities/transactions/transaction_util.cc
+++ b/utilities/transactions/transaction_util.cc
@@ -52,6 +52,12 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
                                 const std::string& key, bool cache_only,
                                 ReadCallback* snap_checker,
                                 SequenceNumber min_uncommitted) {
+  // When `min_uncommitted` is provided, keys are not always committed
+  // in sequence number order, and `snap_checker` is used to check whether
+  // a specific sequence number in the database is visible to the transaction.
+  // So `snap_checker` must be provided.
+  assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
+
  Status result;
  bool need_to_read_sst = false;
@@ -100,8 +106,19 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
    SequenceNumber seq = kMaxSequenceNumber;
    bool found_record_for_key = false;

+    // When min_uncommitted == kMaxSequenceNumber, writes are committed in
+    // sequence number order, so only keys larger than `snap_seq` can cause
+    // conflict.
+    // When min_uncommitted != kMaxSequenceNumber, keys lower than
+    // min_uncommitted will not trigger conflicts, while keys larger than
+    // min_uncommitted might create conflicts, so we need to read them out
+    // from the DB and ask snap_checker to determine. So only
+    // keys lower than min_uncommitted can be skipped.
+    SequenceNumber lower_bound_seq =
+        (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
    Status s = db_impl->GetLatestSequenceForKey(sv, key, !need_to_read_sst,
-                                                &seq, &found_record_for_key);
+                                                lower_bound_seq, &seq,
+                                                &found_record_for_key);

    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
      result = s;
diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h
index 0fe0e87d862..1d910134b66 100644
--- a/utilities/transactions/transaction_util.h
+++ b/utilities/transactions/transaction_util.h
@@ -50,6 +50,9 @@ class TransactionUtil {
  // SST files. This will make it more likely this function will
  // return an error if it is unable to determine if there are any conflicts.
  //
+  // See the comment of CheckKey() for an explanation of `snap_seq`,
+  // `snap_checker` and `min_uncommitted`.
+  //
  // Returns OK on success, BUSY if there is a conflicting write, or other error
  // status for any unexpected errors.
  static Status CheckKeyForConflicts(
@@ -72,6 +75,14 @@ class TransactionUtil {
      bool cache_only);

 private:
+  // If `snap_checker` == nullptr, writes are always committed in sequence
+  // number order. Any write with a sequence number <= `snap_seq` does not
+  // conflict, and any write to `key` with a sequence number > `snap_seq`
+  // triggers a conflict.
+  // If `snap_checker` != nullptr, writes may not commit in sequence number
+  // order. In this case `min_uncommitted` is a lower bound:
+  // seq < `min_uncommitted`: no conflict; seq > `snap_seq`: conflict;
+  // `min_uncommitted` <= seq <= `snap_seq`: ask `snap_checker` to determine.
  static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
                         SequenceNumber earliest_seq, SequenceNumber snap_seq,
                         const std::string& key, bool cache_only,
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index e62b8344169..88f4ea032a9 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -761,6 +761,147 @@ TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
 }

+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    options.max_write_buffer_number_to_maintain = 3;
+    ReOpen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    TransactionOptions txn_options;
+    txn_options.set_snapshot = true;
+    string value;
+    Status s;
+
+    ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn != nullptr);
+    ASSERT_OK(txn->SetName("txn"));
+
+    Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn2 != nullptr);
+    ASSERT_OK(txn2->SetName("txn2"));
+
+    // This transaction is created to cause a potential conflict.
+    Transaction* txn_x = db->BeginTransaction(write_options);
+    ASSERT_OK(txn_x->SetName("txn_x"));
+    ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3")));
+    ASSERT_OK(txn_x->Prepare());
+
+    // Create snapshots after the prepare, but there should still
+    // be a conflict when trying to read "foo".
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold the flush from the beginning. The
+      // memtable will be switched to immutable after calling
+      // TEST_SwitchMemtable() while CheckKey() is called.
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // Force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(db->Flush(flush_ops));
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast(db->GetRootDB());
+      db_impl->TEST_SwitchMemtable();
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                   &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing, but this transaction also needs to
+    // check all memtables because they contain uncommitted data.
+    Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn3 != nullptr);
+    ASSERT_OK(txn3->SetName("txn3"));
+
+    // Commit the pending write
+    ASSERT_OK(txn_x->Commit());
+
+    // Commit txn, txn2 and txn3. txn and txn3 will conflict but txn2 will
+    // pass. In all cases, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    // We should have checked two memtables, and since there is no
+    // conflict, another Get() will be made to fetch the data from the
+    // DB. If the key is in an immutable memtable, two extra memtable
+    // reads will be issued. If it is not (it is in history), only one
+    // will be made, against the active memtable.
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(4, get_perf_context()->get_from_memtable_count);
+    }
+
+    Transaction* txn4 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn4 != nullptr);
+    ASSERT_OK(txn4->SetName("txn4"));
+    get_perf_context()->Reset();
+    ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value));
+    if (attempt == kAttemptHistoryMemtable) {
+      // The active memtable will be checked in snapshot validation and when
+      // getting the value.
+      ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    } else {
+      // Only the active memtable will be checked in snapshot validation,
+      // but both the active and immutable memtables will be queried when
+      // getting the value.
+ assert(attempt == kAttemptImmMemTable); + ASSERT_EQ(3, get_perf_context()->get_from_memtable_count); + } + + ASSERT_OK(txn2->Commit()); + ASSERT_OK(txn4->Commit()); + + TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable"); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + SetPerfLevel(PerfLevel::kDisable); + + delete txn; + delete txn2; + delete txn3; + delete txn4; + delete txn_x; + } +} + // Reproduce the bug with two snapshots with the same seuqence number and test // that the release of the first snapshot will not affect the reads by the other // snapshot From 9bbccda01e127c942c71c3c7fc21c494a2fd1992 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 11 Jun 2019 12:18:37 -0700 Subject: [PATCH 129/572] First commit for block cache trace analyzer (#5425) Summary: This PR contains the first commit for block cache trace analyzer. It reads a block cache trace file and prints statistics of the traces. We will extend this class to provide more functionalities. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5425 Differential Revision: D15709580 Pulled By: HaoyuHuang fbshipit-source-id: 2f43bd2311f460ab569880819d95eeae217c20bb --- CMakeLists.txt | 2 + Makefile | 4 + src.mk | 2 + tools/block_cache_trace_analyzer.cc | 408 +++++++++++++++++++++++ tools/block_cache_trace_analyzer.h | 131 ++++++++ tools/block_cache_trace_analyzer_test.cc | 229 +++++++++++++ trace_replay/block_cache_tracer.cc | 3 +- trace_replay/block_cache_tracer.h | 2 + 8 files changed, 780 insertions(+), 1 deletion(-) create mode 100644 tools/block_cache_trace_analyzer.cc create mode 100644 tools/block_cache_trace_analyzer.h create mode 100644 tools/block_cache_trace_analyzer_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index cef1f85d797..006f6798666 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -621,6 +621,7 @@ set(SOURCES test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc + tools/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc @@ -966,6 +967,7 @@ if(WITH_TESTS) table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + tools/block_cache_trace_analyzer_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc diff --git a/Makefile b/Makefile index 3ee85ad67d0..425c75eb5f5 100644 --- a/Makefile +++ b/Makefile @@ -562,6 +562,7 @@ TESTS = \ sst_file_reader_test \ db_secondary_test \ block_cache_tracer_test \ + block_cache_trace_analyzer_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -1592,6 +1593,9 @@ db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +block_cache_trace_analyzer_test: tools/block_cache_trace_analyzer_test.o tools/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/src.mk b/src.mk index 6303997cd59..150b1c10af9 100644 --- a/src.mk +++ b/src.mk @@ -240,6 +240,7 @@ TOOL_LIB_SOURCES = \ utilities/blob_db/blob_dump_tool.cc \ ANALYZER_LIB_SOURCES = \ + tools/block_cache_trace_analyzer.cc \ tools/trace_analyzer_tool.cc \ MOCK_LIB_SOURCES = \ @@ -365,6 +366,7 @@ MAIN_SOURCES = \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + tools/block_cache_trace_analyzer_test.cc \ 
tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc new file mode 100644 index 00000000000..5d9b2d18409 --- /dev/null +++ b/tools/block_cache_trace_analyzer.cc @@ -0,0 +1,408 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "tools/block_cache_trace_analyzer.h" + +#include +#include +#include "monitoring/histogram.h" + +namespace rocksdb { +namespace { +std::string block_type_to_string(TraceType type) { + switch (type) { + case kBlockTraceFilterBlock: + return "Filter"; + case kBlockTraceDataBlock: + return "Data"; + case kBlockTraceIndexBlock: + return "Index"; + case kBlockTraceRangeDeletionBlock: + return "RangeDeletion"; + case kBlockTraceUncompressionDictBlock: + return "UncompressionDict"; + default: + break; + } + // This cannot happen. + return "InvalidType"; +} + +std::string caller_to_string(BlockCacheLookupCaller caller) { + switch (caller) { + case kUserGet: + return "Get"; + case kUserMGet: + return "MultiGet"; + case kUserIterator: + return "Iterator"; + case kPrefetch: + return "Prefetch"; + case kCompaction: + return "Compaction"; + default: + break; + } + // This cannot happen. + return "InvalidCaller"; +} +} // namespace + +BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( + const std::string& trace_file_path) + : trace_file_path_(trace_file_path) { + env_ = rocksdb::Env::Default(); +} + +void BlockCacheTraceAnalyzer::RecordAccess( + const BlockCacheTraceRecord& access) { + ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name]; + SSTFileAccessInfoAggregate& file_aggr = + cf_aggr.fd_aggregates_map[access.sst_fd_number]; + file_aggr.level = access.level; + BlockTypeAccessInfoAggregate& block_type_aggr = + file_aggr.block_type_aggregates_map[access.block_type]; + BlockAccessInfo& block_access_info = + block_type_aggr.block_access_info_map[access.block_key]; + block_access_info.AddAccess(access); +} + +Status BlockCacheTraceAnalyzer::Analyze() { + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + if (!s.ok()) { + return s; + } + BlockCacheTraceReader reader(std::move(trace_reader)); + s = reader.ReadHeader(&header_); + if (!s.ok()) { + return s; + } + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + return s; + } + RecordAccess(access); + } + return Status::OK(); +} + +void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { + HistogramStat bs_stats; + std::map bt_stats_map; + std::map> cf_bt_stats_map; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. 
+ bs_stats.Add(block_access_info.second.block_size); + bt_stats_map[type].Add(block_access_info.second.block_size); + cf_bt_stats_map[cf_name][type].Add( + block_access_info.second.block_size); + } + } + } + } + fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); + for (auto const& bt_stats : bt_stats_map) { + fprintf(stdout, "Block size stats for block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + fprintf(stdout, + "Block size stats for column family %s and block type %s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { + HistogramStat access_stats; + std::map bt_stats_map; + std::map> cf_bt_stats_map; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + access_stats.Add(block_access_info.second.num_accesses); + bt_stats_map[type].Add(block_access_info.second.num_accesses); + cf_bt_stats_map[cf_name][type].Add( + block_access_info.second.num_accesses); + } + } + } + } + fprintf(stdout, "Block access count stats: \n%s", + access_stats.ToString().c_str()); + for (auto const& bt_stats : bt_stats_map) { + fprintf(stdout, "Block access count stats for block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + fprintf(stdout, + "Block access count stats for column family %s and block type " + "%s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { + HistogramStat existing_keys_stats; + std::map cf_existing_keys_stats_map; + HistogramStat non_existing_keys_stats; + std::map cf_non_existing_keys_stats_map; + HistogramStat block_access_stats; + std::map cf_block_access_info; + + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + if (block_access_info.second.num_keys == 0) { + continue; + } + // Use four decimal points. 
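+          // (The ratios below are fractions in [0, 1] scaled by 10000 and
+          // stored as integers, so a stored value of 1234 represents the
+          // fraction 0.1234, i.e., 12.34%.)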
+          uint64_t percent_referenced_for_existing_keys = (uint64_t)(
+              ((double)block_access_info.second.key_num_access_map.size() /
+               (double)block_access_info.second.num_keys) *
+              10000.0);
+          uint64_t percent_referenced_for_non_existing_keys =
+              (uint64_t)(((double)block_access_info.second
+                              .non_exist_key_num_access_map.size() /
+                          (double)block_access_info.second.num_keys) *
+                         10000.0);
+          uint64_t percent_accesses_for_existing_keys = (uint64_t)(
+              ((double)
+                   block_access_info.second.num_referenced_key_exist_in_block /
+               (double)block_access_info.second.num_accesses) *
+              10000.0);
+          existing_keys_stats.Add(percent_referenced_for_existing_keys);
+          cf_existing_keys_stats_map[cf_name].Add(
+              percent_referenced_for_existing_keys);
+          non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys);
+          cf_non_existing_keys_stats_map[cf_name].Add(
+              percent_referenced_for_non_existing_keys);
+          block_access_stats.Add(percent_accesses_for_existing_keys);
+          cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys);
+        }
+      }
+    }
+  }
+  fprintf(stdout,
+          "Histogram on the percentage of referenced keys that exist in a "
+          "block over the total number of keys in a block: \n%s",
+          existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_existing_keys_stats_map) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  fprintf(
+      stdout,
+      "Histogram on the percentage of referenced keys that do NOT exist in a "
+      "block over the total number of keys in a block: \n%s",
+      non_existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_non_existing_keys_stats_map) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  fprintf(stdout,
+          "Histogram on the percentage of accesses to keys that exist in a "
+          "block over the total number of accesses in a block: \n%s",
+          block_access_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_block_access_info) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintStatsSummary() const {
+  uint64_t total_num_files = 0;
+  uint64_t total_num_blocks = 0;
+  uint64_t total_num_accesses = 0;
+  std::map bt_num_blocks_map;
+  std::map caller_num_access_map;
+  std::map>
+      caller_bt_num_access_map;
+  std::map>
+      caller_level_num_access_map;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    uint64_t cf_num_files = 0;
+    uint64_t cf_num_blocks = 0;
+    std::map cf_bt_blocks;
+    uint64_t cf_num_accesses = 0;
+    std::map cf_caller_num_accesses_map;
+    std::map>
+        cf_caller_level_num_accesses_map;
+    std::map>
+        cf_caller_file_num_accesses_map;
+    std::map>
+        cf_caller_bt_num_accesses_map;
+    total_num_files += cf_aggregates.second.fd_aggregates_map.size();
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      cf_num_files++;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        cf_bt_blocks[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        total_num_blocks +=
+            block_type_aggregates.second.block_access_info_map.size();
+        bt_num_blocks_map[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          cf_num_blocks++;
+          for (auto const& stats :
+               block_access_info.second.caller_num_access_map) {
+            // Stats per caller.
+            const BlockCacheLookupCaller caller = stats.first;
+            const uint64_t num_accesses = stats.second;
+            // Overall stats.
+            total_num_accesses += num_accesses;
+            caller_num_access_map[caller] += num_accesses;
+            caller_bt_num_access_map[caller][type] += num_accesses;
+            caller_level_num_access_map[caller][level] += num_accesses;
+            // Column Family stats.
+            cf_num_accesses++;
+            cf_caller_num_accesses_map[caller] += num_accesses;
+            cf_caller_level_num_accesses_map[caller][level] += num_accesses;
+            cf_caller_file_num_accesses_map[caller][fd] += num_accesses;
+            cf_caller_bt_num_accesses_map[caller][type] += num_accesses;
+          }
+        }
+      }
+    }
+
+    // Print stats.
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str());
+    fprintf(stdout,
+            "Number of files: %" PRIu64 " Number of blocks: %" PRIu64
+            " Number of accesses: %" PRIu64 "\n",
+            cf_num_files, cf_num_blocks, cf_num_accesses);
+    for (auto block_type : cf_bt_blocks) {
+      fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n",
+              block_type_to_string(block_type.first).c_str(),
+              block_type.second);
+    }
+    for (auto caller : cf_caller_num_accesses_map) {
+      fprintf(
+          stdout,
+          "***************************************************************\n");
+      fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n",
+              caller_to_string(caller.first).c_str(), caller.second);
+      fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_level :
+           cf_caller_level_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 "\n",
+                naccess_level.first, naccess_level.second);
+      }
+      fprintf(stdout, "Caller %s: Number of accesses per file break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t File %" PRIu64 ": Number of accesses: %" PRIu64 "\n",
+                naccess_file.first, naccess_file.second);
+      }
+      fprintf(stdout,
+              "Caller %s: Number of accesses per block type break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) {
+        fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n",
+                block_type_to_string(naccess_type.first).c_str(),
+                naccess_type.second);
+      }
+    }
+  }
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout, "Overall statistics:\n");
+  fprintf(stdout,
+          "Number of files: %" PRIu64 " Number of blocks: %"
PRIu64
+          " Number of accesses: %" PRIu64 "\n",
+          total_num_files, total_num_blocks, total_num_accesses);
+  for (auto block_type : bt_num_blocks_map) {
+    fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n",
+            block_type_to_string(block_type.first).c_str(), block_type.second);
+  }
+  for (auto caller : caller_num_access_map) {
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n",
+            caller_to_string(caller.first).c_str(), caller.second);
+    fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_level : caller_level_num_access_map[caller.first]) {
+      fprintf(stdout, "\t Level %d: Number of accesses: %" PRIu64 "\n",
+              naccess_level.first, naccess_level.second);
+    }
+    fprintf(stdout, "Caller %s: Number of accesses per block type break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_type : caller_bt_num_access_map[caller.first]) {
+      fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n",
+              block_type_to_string(naccess_type.first).c_str(),
+              naccess_type.second);
+    }
+  }
+}
+
+}  // namespace rocksdb
diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h
new file mode 100644
index 00000000000..9dde8a939b5
--- /dev/null
+++ b/tools/block_cache_trace_analyzer.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+
+#include "rocksdb/env.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+// Statistics of a block.
+struct BlockAccessInfo {
+  uint64_t num_accesses = 0;
+  uint64_t block_size = 0;
+  uint64_t first_access_time = 0;
+  uint64_t last_access_time = 0;
+  uint64_t num_keys = 0;
+  std::map
+      key_num_access_map;  // for keys that exist in this block.
+  std::map
+      non_exist_key_num_access_map;  // for keys that do not exist in this block.
+  uint64_t num_referenced_key_exist_in_block = 0;
+  std::map caller_num_access_map;
+
+  void AddAccess(const BlockCacheTraceRecord& access) {
+    if (first_access_time == 0) {
+      first_access_time = access.access_timestamp;
+    }
+    last_access_time = access.access_timestamp;
+    block_size = access.block_size;
+    caller_num_access_map[access.caller]++;
+    num_accesses++;
+    if (ShouldTraceReferencedKey(access)) {
+      num_keys = access.num_keys_in_block;
+
+      if (access.is_referenced_key_exist_in_block == Boolean::kTrue) {
+        key_num_access_map[access.referenced_key]++;
+        num_referenced_key_exist_in_block++;
+      } else {
+        non_exist_key_num_access_map[access.referenced_key]++;
+      }
+    }
+  }
+};
+
+// Aggregates stats of a block given a block type.
+struct BlockTypeAccessInfoAggregate {
+  std::map block_access_info_map;
+};
+
+// Aggregates BlockTypeAggregate given an SST file.
+struct SSTFileAccessInfoAggregate {
+  uint32_t level;
+  std::map block_type_aggregates_map;
+};
+
+// Aggregates SSTFileAggregate given a column family.
+struct ColumnFamilyAccessInfoAggregate {
+  std::map fd_aggregates_map;
+};
+
+class BlockCacheTraceAnalyzer {
+ public:
+  BlockCacheTraceAnalyzer(const std::string& trace_file_path);
+  ~BlockCacheTraceAnalyzer() = default;
+  // No copy and move.
+  BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
+  BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;
+
+  // Reads all access records in the given trace file, maintains the stats of
+  // each block, and aggregates the information by block type, SST file, and
+  // column family. Subsequently, the caller may call Print* functions to
+  // print statistics.
+  Status Analyze();
+
+  // Print a summary of statistics of the trace, e.g.,
+  // Number of files: 2 Number of blocks: 50 Number of accesses: 50
+  // Number of Index blocks: 10
+  // Number of Filter blocks: 10
+  // Number of Data blocks: 10
+  // Number of UncompressionDict blocks: 10
+  // Number of RangeDeletion blocks: 10
+  // ***************************************************************
+  // Caller Get: Number of accesses 10
+  // Caller Get: Number of accesses per level break down
+  //    Level 0: Number of accesses: 10
+  // Caller Get: Number of accesses per block type break down
+  //    Block Type Index: Number of accesses: 2
+  //    Block Type Filter: Number of accesses: 2
+  //    Block Type Data: Number of accesses: 2
+  //    Block Type UncompressionDict: Number of accesses: 2
+  //    Block Type RangeDeletion: Number of accesses: 2
+  void PrintStatsSummary() const;
+
+  // Print the block size distribution, and its break down by block type and
+  // column family.
+  void PrintBlockSizeStats() const;
+
+  // Print the access count distribution, and its break down by block type and
+  // column family.
+  void PrintAccessCountStats() const;
+
+  // Print data block accesses by user Get and Multi-Get.
+  // It prints out 1) a histogram of the percentage of keys in a data block
+  // that are referenced, broken down by whether the referenced keys exist in
+  // the block and further by column family, and 2) a histogram of the
+  // percentage of accesses to keys that exist in a data block, also broken
+  // down by column family.
+  void PrintDataBlockAccessStats() const;
+
+  const std::map&
+  TEST_cf_aggregates_map() const {
+    return cf_aggregates_map_;
+  }
+
+ private:
+  void RecordAccess(const BlockCacheTraceRecord& access);
+
+  rocksdb::Env* env_;
+  std::string trace_file_path_;
+  BlockCacheTraceHeader header_;
+  std::map cf_aggregates_map_;
+};
+
+}  // namespace rocksdb
diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
new file mode 100644
index 00000000000..96f52c1ec00
--- /dev/null
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -0,0 +1,229 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/block_cache_trace_analyzer.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+namespace {
+const uint64_t kBlockSize = 1024;
+const std::string kBlockKeyPrefix = "test-block-";
+const uint32_t kCFId = 0;
+const uint32_t kLevel = 1;
+const uint64_t kSSTStoringEvenKeys = 100;
+const uint64_t kSSTStoringOddKeys = 101;
+const std::string kRefKeyPrefix = "test-get-";
+const uint64_t kNumKeysInBlock = 1024;
+}  // namespace
+
+class BlockCacheTracerTest : public testing::Test {
+ public:
+  BlockCacheTracerTest() {
+    test_path_ = test::PerThreadDBPath("block_cache_tracer_test");
+    env_ = rocksdb::Env::Default();
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/block_cache_trace";
+  }
+
+  ~BlockCacheTracerTest() override {
+    if (getenv("KEEP_DB")) {
+      printf("The trace file is still at %s\n", trace_file_path_.c_str());
+      return;
+    }
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  BlockCacheLookupCaller GetCaller(uint32_t key_id) {
+    uint32_t n = key_id % 5;
+    switch (n) {
+      case 0:
+        return BlockCacheLookupCaller::kPrefetch;
+      case 1:
+        return BlockCacheLookupCaller::kCompaction;
+      case 2:
+        return BlockCacheLookupCaller::kUserGet;
+      case 3:
+        return BlockCacheLookupCaller::kUserMGet;
+      case 4:
+        return BlockCacheLookupCaller::kUserIterator;
+    }
+    // This cannot happen.
+    assert(false);
+    return BlockCacheLookupCaller::kUserGet;
+  }
+
+  void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
+                        TraceType block_type, uint32_t nblocks) {
+    assert(writer);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      record.block_type = block_type;
+      record.block_size = kBlockSize + key_id;
+      record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+      record.access_timestamp = env_->NowMicros();
+      record.cf_id = kCFId;
+      record.cf_name = kDefaultColumnFamilyName;
+      record.caller = GetCaller(key_id);
+      record.level = kLevel;
+      if (key_id % 2 == 0) {
+        record.sst_fd_number = kSSTStoringEvenKeys;
+      } else {
+        record.sst_fd_number = kSSTStoringOddKeys;
+      }
+      record.is_cache_hit = Boolean::kFalse;
+      record.no_insert = Boolean::kFalse;
+      // Provide these fields for all block types.
+      // The writer should only write these fields when the block is a data
+      // block and the caller is either GET or MGET.
+      record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
+      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.num_keys_in_block = kNumKeysInBlock;
+      ASSERT_OK(writer->WriteBlockAccess(record));
+    }
+  }
+
+  void AssertBlockAccessInfo(
+      uint32_t key_id, TraceType type,
+      const std::map<std::string, BlockAccessInfo>& block_access_info_map) {
+    auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+    ASSERT_TRUE(block_access_info_map.find(key_id_str) !=
+                block_access_info_map.end());
+    auto& block_access_info = block_access_info_map.find(key_id_str)->second;
+    ASSERT_EQ(1, block_access_info.num_accesses);
+    ASSERT_EQ(kBlockSize + key_id, block_access_info.block_size);
+    ASSERT_GT(block_access_info.first_access_time, 0);
+    ASSERT_GT(block_access_info.last_access_time, 0);
+    ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
+    BlockCacheLookupCaller expected_caller = GetCaller(key_id);
+    ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
+                block_access_info.caller_num_access_map.end());
+    ASSERT_EQ(
+        1,
+        block_access_info.caller_num_access_map.find(expected_caller)->second);
+
+    if ((expected_caller == BlockCacheLookupCaller::kUserGet ||
+         expected_caller == BlockCacheLookupCaller::kUserMGet) &&
+        type == TraceType::kBlockTraceDataBlock) {
+      ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
+      ASSERT_EQ(1, block_access_info.key_num_access_map.size());
+      ASSERT_EQ(0, block_access_info.non_exist_key_num_access_map.size());
+      ASSERT_EQ(1, block_access_info.num_referenced_key_exist_in_block);
+    }
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    // It contains two SST files with 25 blocks of odd-numbered block_key in
+    // kSSTStoringOddKeys and 25 blocks of even-numbered block_key in
+    // kSSTStoringEvenKeys.
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+    ASSERT_OK(writer.WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock,
+                     10);
+    WriteBlockAccess(&writer, 10, TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(&writer, 20, TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(&writer, 30, TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(&writer, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, header.rocksdb_major_version);
+    ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version);
+    // Read blocks.
+    BlockCacheTraceAnalyzer analyzer(trace_file_path_);
+    // The analyzer ends when it detects an incomplete access record.
+    ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
+    const uint64_t expected_num_cfs = 1;
+    std::vector<uint64_t> expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys};
+    const std::vector<TraceType> expected_types{
+        TraceType::kBlockTraceUncompressionDictBlock,
+        TraceType::kBlockTraceDataBlock, TraceType::kBlockTraceFilterBlock,
+        TraceType::kBlockTraceIndexBlock,
+        TraceType::kBlockTraceRangeDeletionBlock};
+    const uint64_t expected_num_keys_per_type = 5;
+
+    auto& stats = analyzer.TEST_cf_aggregates_map();
+    ASSERT_EQ(expected_num_cfs, stats.size());
+    ASSERT_TRUE(stats.find(kDefaultColumnFamilyName) != stats.end());
+    auto& cf_stats = stats.find(kDefaultColumnFamilyName)->second;
+    ASSERT_EQ(expected_fds.size(), cf_stats.fd_aggregates_map.size());
+    for (auto fd_id : expected_fds) {
+      ASSERT_TRUE(cf_stats.fd_aggregates_map.find(fd_id) !=
+                  cf_stats.fd_aggregates_map.end());
+      ASSERT_EQ(kLevel, cf_stats.fd_aggregates_map.find(fd_id)->second.level);
+      auto& block_type_aggregates_map = cf_stats.fd_aggregates_map.find(fd_id)
+                                            ->second.block_type_aggregates_map;
+      ASSERT_EQ(expected_types.size(), block_type_aggregates_map.size());
+      uint32_t key_id = 0;
+      for (auto type : expected_types) {
+        ASSERT_TRUE(block_type_aggregates_map.find(type) !=
+                    block_type_aggregates_map.end());
+        auto& block_access_info_map =
+            block_type_aggregates_map.find(type)->second.block_access_info_map;
+        // Each block type has 5 blocks.
+        ASSERT_EQ(expected_num_keys_per_type, block_access_info_map.size());
+        for (uint32_t i = 0; i < 10; i++) {
+          // Verify that odd numbered blocks are stored in kSSTStoringOddKeys
+          // and even numbered blocks are stored in kSSTStoringEvenKeys.
+          auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+          if (fd_id == kSSTStoringOddKeys) {
+            if (key_id % 2 == 1) {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            } else {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            }
+          } else {
+            if (key_id % 2 == 1) {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            } else {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            }
+          }
+          key_id++;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index 8d0119a6891..58c7df70b20 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -15,12 +15,13 @@ namespace rocksdb {
 
 namespace {
 const unsigned int kCharSize = 1;
+}  // namespace
+
 bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) {
   return (record.block_type == TraceType::kBlockTraceDataBlock) &&
          (record.caller == BlockCacheLookupCaller::kUserGet ||
           record.caller == BlockCacheLookupCaller::kUserMGet);
 }
-}  // namespace
 
 BlockCacheTraceWriter::BlockCacheTraceWriter(
     Env* env, const TraceOptions& trace_options,
diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h
index 5fd14cbf11b..e24d5a5ef35 100644
--- a/trace_replay/block_cache_tracer.h
+++ b/trace_replay/block_cache_tracer.h
@@ -77,6 +77,8 @@ struct BlockCacheTraceHeader {
   uint32_t rocksdb_minor_version;
 };
 
+bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record);
+
 // BlockCacheTraceWriter captures all RocksDB block cache accesses using a
 // user-provided TraceWriter. Every RocksDB operation is written as a single
 // trace.
 // Each trace will have a timestamp and type, followed by the trace payload.

From 7177dc46a13332c96332d524b20f14b7e1372d07 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Tue, 11 Jun 2019 13:04:59 -0700
Subject: [PATCH 130/572] Handle missing WAL in secondary mode (#5323)

Summary:
In secondary mode, it is possible that the secondary lists the primary's WAL
directory, finds a WAL and tries to open it. The primary may delete the WAL
after the secondary lists the directory but before the secondary opens the
file. The secondary will then fail to open the WAL file with a PathNotFound
status. In this case, we can return OK without replaying the WAL and
optionally replay more of the MANIFEST.

Test Plan (on my dev machine):
Without this PR, the following will fail several times out of 100 runs.
```
~/gtest-parallel/gtest-parallel -r 100 -w 16 ./db_secondary_test --gtest_filter=DBSecondaryTest.SwitchToNewManifestDuringOpen
```
With this PR, the above should always succeed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5323

Differential Revision: D15763878

Pulled By: riversand963

fbshipit-source-id: c7164fa7cb8d9001abc258b6a2dc93613e4f38ff
---
 db/db_impl/db_impl_secondary.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 2737df0ae8c..5cd0beb1f0c 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -60,6 +60,12 @@ Status DBImplSecondary::Recover(
     s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
   }
 
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
   // TODO: update options_file_number_ needed?
 
   job_context.Clean();
@@ -475,6 +481,12 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
   if (s.ok()) {
     s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
   }
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
   if (s.ok()) {
     for (auto cfd : cfds_changed) {
       cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),

From ba64a4cf52cce5cf180135e5aeddaa90b7887f9d Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Tue, 11 Jun 2019 16:19:13 -0700
Subject: [PATCH 131/572] Revert "Reduce iterator key comparison for
 upper/lower bound check (#5111)" (#5440)

Summary:
This reverts commit f3a7847598d89ef8f9f531b10fabb7ce044a38f8.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5440

Differential Revision: D15765967

Pulled By: ltamasi

fbshipit-source-id: d027fe24132e3729289cd7c01857a7eb449d9dd0
---
 HISTORY.md                                    |  1 -
 db/db_iter.cc                                 |  9 +----
 db/version_set.cc                             | 40 +++++--------------
 table/block_based/block_based_table_reader.cc | 26 +++++-------
 table/block_based/block_based_table_reader.h  |  9 +----
 table/internal_iterator.h                     | 25 +-----------
 table/iterator_wrapper.h                      | 22 ++--------
 table/merging_iterator.cc                     | 24 -----------
 8 files changed, 28 insertions(+), 128 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index ad6c370b5a0..5574c769878 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,7 +18,6 @@
 * Reduce binary search when iterator reseek into the same data block.
 * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
 * Merging iterator to avoid child iterator reseek for some cases
-* Reduce iterator key comparision for upper/lower bound check.
* Log Writer will flush after finishing the whole record, rather than a fragment. ### General Improvements diff --git a/db/db_iter.cc b/db/db_iter.cc index 633724c5763..b89d7301131 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,9 +467,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); - if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && + if (iterate_upper_bound_ != nullptr && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -861,10 +859,7 @@ void DBIter::PrevInternal() { return; } - assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); - if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && + if (iterate_lower_bound_ != nullptr && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/version_set.cc b/db/version_set.cc index 8895879bfbf..658a397fa58 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -885,7 +885,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(IterateResult* result) override; + bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -893,38 +893,23 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } - Slice value() const override { assert(Valid()); return file_iter_.value(); } - Status status() const override { return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); } - - inline bool MayBeOutOfLowerBound() override { - assert(Valid()); - return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); - } - - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return file_iter_.MayBeOutOfUpperBound(); - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } - bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } - bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -968,16 +953,12 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } - may_be_out_of_lower_bound_ = - read_options_.iterate_lower_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(file_smallest_key(file_index_)), - *read_options_.iterate_lower_bound) < 0; return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, file_read_hist_, - for_compaction_, nullptr /* arena */, skip_filters_, level_, - smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, + file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, + level_, smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -993,7 +974,6 @@ class LevelIterator final : public InternalIterator { bool should_sample_; bool for_compaction_; bool skip_filters_; - bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1062,12 +1042,11 @@ void LevelIterator::SeekToLast() { void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(IterateResult* result) { +bool LevelIterator::NextAndGetResult(Slice* ret_key) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + *ret_key = key(); } return is_valid; } @@ -4363,9 +4342,10 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), log_number, prev_log_number_, - column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, + next_file_number_.load(), last_sequence_.load(), log_number, + prev_log_number_, column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index d1beafed68b..75c8301c5c2 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2535,12 +2535,11 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - IterateResult* result) { + Slice* ret_key) { Next(); bool is_valid = Valid(); if (is_valid) { 
- result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + *ret_key = key(); } return is_valid; } @@ -2621,11 +2620,6 @@ void BlockBasedTableIterator::InitDataBlock() { key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; - if (read_options_.iterate_upper_bound != nullptr) { - data_block_within_upper_bound_ = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) > 0); - } } } @@ -2638,15 +2632,13 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - // TODO: we should be able to use !data_block_within_upper_bound_ here - // instead of performing the comparison; however, the flag can apparently - // be out of sync with the comparison in some cases. This should be - // investigated. - const bool next_block_is_out_of_bound = - read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); + bool next_block_is_out_of_bound = false; + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + next_block_is_out_of_bound = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + } ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index a92289f9bee..420da25932b 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -608,7 +608,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(IterateResult* result) override; + bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && block_iter_points_to_real_block_ && @@ -639,11 +639,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return !data_block_within_upper_bound_; - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -705,8 +700,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { TBlockIter block_iter_; bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; - // Whether current data block being fully within iterate upper bound. - bool data_block_within_upper_bound_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 1f57399c7f7..8f1cc9dd68e 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,11 +17,6 @@ namespace rocksdb { class PinnedIteratorsManager; -struct IterateResult { - Slice key; - bool may_be_out_of_upper_bound; -}; - template class InternalIteratorBase : public Cleanable { public: @@ -60,20 +55,11 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - // Moves to the next entry in the source, and return result. 
Iterator - // implementation should override this method to help methods inline better, - // or when MayBeOutOfUpperBound() is non-trivial. - // REQUIRES: Valid() - virtual bool NextAndGetResult(IterateResult* result) { + virtual bool NextAndGetResult(Slice* ret_key) { Next(); bool is_valid = Valid(); if (is_valid) { - result->key = key(); - // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual - // call. If an implementation has non-trivial MayBeOutOfUpperBound(), - // it should also override NextAndGetResult(). - result->may_be_out_of_upper_bound = true; - assert(MayBeOutOfUpperBound()); + *ret_key = key(); } return is_valid; } @@ -108,13 +94,6 @@ class InternalIteratorBase : public Cleanable { // upper bound virtual bool IsOutOfBound() { return false; } - // Keys return from this iterator can be smaller than iterate_lower_bound. - virtual bool MayBeOutOfLowerBound() { return true; } - - // Keys return from this iterator can be larger or equal to - // iterate_upper_bound. - virtual bool MayBeOutOfUpperBound() { return true; } - // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a5aa5c49eac..a570e53c1e2 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,10 +56,7 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { - assert(Valid()); - return result_.key; - } + Slice key() const { assert(Valid()); return key_; } TValue value() const { assert(Valid()); return iter_->value(); @@ -68,7 +65,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&result_); + valid_ = iter_->NextAndGetResult(&key_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -86,16 +83,6 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } - bool MayBeOutOfLowerBound() { - assert(Valid()); - return iter_->MayBeOutOfLowerBound(); - } - - bool MayBeOutOfUpperBound() { - assert(Valid()); - return result_.may_be_out_of_upper_bound; - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -113,15 +100,14 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { + key_ = iter_->key(); assert(iter_->status().ok()); - result_.key = iter_->key(); - result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; - IterateResult result_; bool valid_; + Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 1a0d4df8995..207066b5a1e 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,16 +227,6 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } - bool NextAndGetResult(IterateResult* result) override { - Next(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); - } - return is_valid; - } - void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). 
@@ -306,20 +296,6 @@ class MergingIterator : public InternalIterator { return current_->value(); } - // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result - // from current child iterator. Potentially as long as one of child iterator - // report out of bound is not possible, we know current key is within bound. - - bool MayBeOutOfLowerBound() override { - assert(Valid()); - return current_->MayBeOutOfLowerBound(); - } - - bool MayBeOutOfUpperBound() override { - assert(Valid()); - return current_->MayBeOutOfUpperBound(); - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From ca1aee2a198f8b461f4c168232ed65d9a205ce9e Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 11 Jun 2019 17:58:31 -0700 Subject: [PATCH 132/572] WriteUnprepared: commit only from the 2nd queue (#5439) Summary: This is a port of this PR into WriteUnprepared: https://github.com/facebook/rocksdb/pull/5014 This also reverts this test change to restore some flaky write unprepared tests: https://github.com/facebook/rocksdb/pull/5315 Tested with: $ gtest-parallel ./transaction_test --gtest_filter=MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/9 --repeat=128 [128/128] MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/9 (18250 ms) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5439 Differential Revision: D15761405 Pulled By: lth fbshipit-source-id: ae2581fd942d8a5b3f9278fd6bc3c1ac0b2c964c --- utilities/transactions/transaction_test.cc | 4 ++ .../transactions/write_unprepared_txn.cc | 54 ++++++++++--------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 35a9706830e..a410c5b5196 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -74,6 +74,10 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index a1fe213ddd3..54d478c9466 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -281,23 +281,30 @@ Status WriteUnpreparedTxn::CommitInternal() { const bool disable_memtable = !includes_data; const bool do_one_write = !db_impl_->immutable_db_options().two_write_queues || disable_memtable; - const bool publish_seq = do_one_write; - // Note: CommitTimeWriteBatch does not need AddPrepared since it is written to - // DB in one shot. min_uncommitted still works since it requires capturing - // data that is written to DB but not yet committed, while - // CommitTimeWriteBatch commits with PreReleaseCallback. 
+ WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map( - wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt, publish_seq); + wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt); + const bool kFirstPrepareBatch = true; + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, commit_batch_cnt, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } uint64_t seq_used = kMaxSequenceNumber; - // Since the prepared batch is directly written to memtable, there is already - // a connection between the memtable and its WAL, so there is no need to - // redundantly reference the log that contains the prepared data. + // Since the prepared batch is directly written to memtable, there is + // already a connection between the memtable and its WAL, so there is no + // need to redundantly reference the log that contains the prepared data. const uint64_t zero_log_number = 0ull; size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1; auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, zero_log_number, disable_memtable, &seq_used, - batch_cnt, &update_commit_map); + batch_cnt, pre_release_callback); assert(!s.ok() || seq_used != kMaxSequenceNumber); + const SequenceNumber commit_batch_seq = seq_used; if (LIKELY(do_one_write || !s.ok())) { if (LIKELY(s.ok())) { // Note RemovePrepared should be called after WriteImpl that publishsed @@ -306,30 +313,25 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } } + if (UNLIKELY(!do_one_write)) { + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } unprep_seqs_.clear(); write_set_keys_.clear(); return s; } // else do the 2nd write to publish seq + + // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the + // commit write batch as just another "unprepared" batch. This will also + // update the unprep_seqs_ in the update_commit_map callback. + unprep_seqs_[commit_batch_seq] = commit_batch_cnt; + // Note: the 2nd write comes with a performance penality. So if we have too // many of commits accompanied with ComitTimeWriteBatch and yet we cannot // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, // two_write_queues should be disabled to avoid many additional writes here. 
-  class PublishSeqPreReleaseCallback : public PreReleaseCallback {
-   public:
-    explicit PublishSeqPreReleaseCallback(DBImpl* db_impl)
-        : db_impl_(db_impl) {}
-    Status Callback(SequenceNumber seq,
-                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
-                    size_t /*index*/, size_t /*total*/) override {
-      assert(is_mem_disabled);
-      assert(db_impl_->immutable_db_options().two_write_queues);
-      db_impl_->SetLastPublishedSequence(seq);
-      return Status::OK();
-    }
-   private:
-    DBImpl* db_impl_;
-  } publish_seq_callback(db_impl_);
+  // Update commit map only from the 2nd queue
   WriteBatch empty_batch;
   empty_batch.PutLogData(Slice());
   // In the absence of Prepare markers, use Noop as a batch separator
@@ -339,7 +341,7 @@ Status WriteUnpreparedTxn::CommitInternal() {
   const uint64_t NO_REF_LOG = 0;
   s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
                           NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
-                          &publish_seq_callback);
+                          &update_commit_map);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
   // Note RemovePrepared should be called after WriteImpl that published the
   // seq. Otherwise SmallestUnCommittedSeq optimization breaks.

From ba64a4cf52cce5cf180135e5aeddaa90b7887f9d Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Tue, 11 Jun 2019 19:52:08 -0700
Subject: [PATCH 133/572] WritePrepared: switch PreparedHeap from
 priority_queue to deque (#5436)

Summary:
Internally PreparedHeap is currently using a priority_queue. The rationale was
that, in the initial design, PreparedHeap::AddPrepared could be called in
arbitrary order. With the recent optimizations, we call ::AddPrepared only from
the main write queue, which results in in-order insertion into PreparedHeap.
The patch thus replaces the underlying priority_queue with a more efficient
deque implementation.
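In outline, the new scheme can be captured by the following minimal,
single-threaded sketch. The names and the UINT64_MAX empty-sentinel are
illustrative only, not RocksDB's actual PreparedHeap, which additionally
guards push/pop with push_pop_mutex_ and relies on external synchronization
for erase:
```
#include <cstdint>
#include <deque>
#include <functional>
#include <queue>
#include <vector>

class SimplePreparedHeap {
 public:
  // REQUIRES: v is larger than any value pushed before (in-order insertion),
  // which is what lets a deque replace the priority_queue.
  void Push(uint64_t v) { heap_.push_back(v); }  // O(1) instead of O(log n)

  // Erase is lazy: values buried in the deque are remembered in a small
  // min-heap and skipped once they reach the front.
  void Erase(uint64_t v) {
    if (heap_.empty() || v < heap_.front()) {
      return;  // already popped (or never pushed); ignore
    }
    if (v == heap_.front()) {
      Pop();
    } else {
      erased_.push(v);  // deferred removal
    }
  }

  // Smallest live value, or UINT64_MAX when empty.
  uint64_t Top() const { return heap_.empty() ? UINT64_MAX : heap_.front(); }

 private:
  void Pop() {
    heap_.pop_front();
    // Drain any entries that were erased while buried in the deque.
    while (!heap_.empty() && !erased_.empty() &&
           heap_.front() >= erased_.top()) {
      if (heap_.front() == erased_.top()) {
        heap_.pop_front();
      }
      erased_.pop();
    }
    // Drop erase records for values that were never actually pushed.
    while (heap_.empty() && !erased_.empty()) {
      erased_.pop();
    }
  }

  std::deque<uint64_t> heap_;  // stays sorted thanks to in-order pushes
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      erased_;
};
```
Because pushes arrive pre-sorted, push_back keeps the deque ordered for free,
and the lazy-erase min-heap only has to track the rare entries removed out of
order.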
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5436 Differential Revision: D15752147 Pulled By: maysamyabandeh fbshipit-source-id: e6960f2b2097e13137dded1ceeff3b10b03b0aeb --- .../write_prepared_transaction_test.cc | 112 +++++++++++------- .../transactions/write_prepared_txn_db.cc | 7 ++ .../transactions/write_prepared_txn_db.h | 31 +++-- .../transactions/write_unprepared_txn_db.cc | 13 +- 4 files changed, 102 insertions(+), 61 deletions(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 88f4ea032a9..7830cbd75fc 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -48,18 +48,21 @@ using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat; TEST(PreparedHeap, BasicsTest) { WritePreparedTxnDB::PreparedHeap heap; - heap.push(14l); - // Test with one element - ASSERT_EQ(14l, heap.top()); - heap.push(24l); - heap.push(34l); - // Test that old min is still on top - ASSERT_EQ(14l, heap.top()); - heap.push(44l); - heap.push(54l); - heap.push(64l); - heap.push(74l); - heap.push(84l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(14l); + // Test with one element + ASSERT_EQ(14l, heap.top()); + heap.push(24l); + heap.push(34l); + // Test that old min is still on top + ASSERT_EQ(14l, heap.top()); + heap.push(44l); + heap.push(54l); + heap.push(64l); + heap.push(74l); + heap.push(84l); + } // Test that old min is still on top ASSERT_EQ(14l, heap.top()); heap.erase(24l); @@ -81,11 +84,14 @@ TEST(PreparedHeap, BasicsTest) { ASSERT_EQ(64l, heap.top()); heap.erase(84l); ASSERT_EQ(64l, heap.top()); - heap.push(85l); - heap.push(86l); - heap.push(87l); - heap.push(88l); - heap.push(89l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(85l); + heap.push(86l); + heap.push(87l); + heap.push(88l); + heap.push(89l); + } heap.erase(87l); heap.erase(85l); heap.erase(89l); @@ -106,13 +112,19 @@ TEST(PreparedHeap, BasicsTest) { // not resurface again. TEST(PreparedHeap, EmptyAtTheEnd) { WritePreparedTxnDB::PreparedHeap heap; - heap.push(40l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } ASSERT_EQ(40l, heap.top()); // Although not a recommended scenario, we must be resilient against erase // without a prior push. heap.erase(50l); ASSERT_EQ(40l, heap.top()); - heap.push(60l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } ASSERT_EQ(40l, heap.top()); heap.erase(60l); @@ -120,11 +132,17 @@ TEST(PreparedHeap, EmptyAtTheEnd) { heap.erase(40l); ASSERT_TRUE(heap.empty()); - heap.push(40l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } ASSERT_EQ(40l, heap.top()); heap.erase(50l); ASSERT_EQ(40l, heap.top()); - heap.push(60l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } ASSERT_EQ(40l, heap.top()); heap.erase(40l); @@ -139,30 +157,37 @@ TEST(PreparedHeap, EmptyAtTheEnd) { // successfully emptied at the end. TEST(PreparedHeap, Concurrent) { const size_t t_cnt = 10; - rocksdb::port::Thread t[t_cnt]; - Random rnd(1103); + rocksdb::port::Thread t[t_cnt + 1]; WritePreparedTxnDB::PreparedHeap heap; port::RWMutex prepared_mutex; + std::atomic last; for (size_t n = 0; n < 100; n++) { - for (size_t i = 0; i < t_cnt; i++) { - // This is not recommended usage but we should be resilient against it. 
- bool skip_push = rnd.OneIn(5); - t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, skip_push, i]() { - auto seq = i; - std::this_thread::yield(); + last = 0; + t[0] = rocksdb::port::Thread([&heap, t_cnt, &last]() { + Random rnd(1103); + for (size_t seq = 1; seq <= t_cnt; seq++) { + // This is not recommended usage but we should be resilient against it. + bool skip_push = rnd.OneIn(5); if (!skip_push) { - WriteLock wl(&prepared_mutex); + MutexLock ml(heap.push_pop_mutex()); + std::this_thread::yield(); heap.push(seq); + last.store(seq); } - std::this_thread::yield(); - { - WriteLock wl(&prepared_mutex); - heap.erase(seq); - } + } + }); + for (size_t i = 1; i <= t_cnt; i++) { + t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, &last, i]() { + auto seq = i; + do { + std::this_thread::yield(); + } while (last.load() < seq); + WriteLock wl(&prepared_mutex); + heap.erase(seq); }); } - for (size_t i = 0; i < t_cnt; i++) { + for (size_t i = 0; i <= t_cnt; i++) { t[i].join(); } ASSERT_TRUE(heap.empty()); @@ -3197,7 +3222,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { ReOpen(); std::atomic snap = {nullptr}; std::atomic exp_prepare = {0}; - std::atomic snapshot_taken = {false}; + rocksdb::port::Thread callback_thread; // Value is synchronized via snap PinnableSlice value; // Take a snapshot after publish and before RemovePrepared:Start @@ -3208,7 +3233,6 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { roptions.snapshot = snap.load(); auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); ASSERT_OK(s); - snapshot_taken.store(true); }; auto callback = [&](void* param) { SequenceNumber prep_seq = *((SequenceNumber*)param); @@ -3216,8 +3240,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // We need to spawn a thread to avoid deadlock since getting a // snpashot might end up calling AdvanceSeqByOne which needs joining // the write queue. - auto t = rocksdb::port::Thread(snap_callback); - t.detach(); + callback_thread = rocksdb::port::Thread(snap_callback); TEST_SYNC_POINT("callback:end"); } }; @@ -3250,15 +3273,12 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // Let an eviction to kick in std::this_thread::yield(); - snapshot_taken.store(false); exp_prepare.store(txn->GetId()); ASSERT_OK(txn->Commit()); delete txn; // Wait for the snapshot taking that is triggered by // RemovePrepared:Start callback - while (!snapshot_taken) { - std::this_thread::yield(); - } + callback_thread.join(); // Read with the snapshot taken before delayed_prepared_ cleanup ReadOptions roptions; @@ -3278,9 +3298,9 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { }); write_thread.join(); eviction_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 96e1aa7a7ba..a3b523a22cf 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -32,12 +32,19 @@ Status WritePreparedTxnDB::Initialize( auto dbimpl = reinterpret_cast(GetRootDB()); assert(dbimpl != nullptr); auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; for (auto rtxn : rtxns) { // There should only one batch for WritePrepared policy. 
      assert(rtxn.second->batches_.size() == 1);
      const auto& seq = rtxn.second->batches_.begin()->first;
      const auto& batch_info = rtxn.second->batches_.begin()->second;
      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+     ordered_seq_cnt[seq] = cnt;
+   }
+   // AddPrepared must be called in order
+   for (auto seq_cnt : ordered_seq_cnt) {
+     auto seq = seq_cnt.first;
+     auto cnt = seq_cnt.second;
      for (size_t i = 0; i < cnt; i++) {
        AddPrepared(seq + i);
      }
diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h
index acf2b97a99d..9561bfada17 100644
--- a/utilities/transactions/write_prepared_txn_db.h
+++ b/utilities/transactions/write_prepared_txn_db.h
@@ -511,9 +511,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
     // The mutex is required for push and pop from PreparedHeap. ::erase will
     // use external synchronization via prepared_mutex_.
     port::Mutex push_pop_mutex_;
-    // TODO(myabandeh): replace it with deque
-    std::priority_queue<uint64_t, std::vector<uint64_t>,
-                        std::greater<uint64_t>>
-        heap_;
+    std::deque<uint64_t> heap_;
     std::priority_queue<uint64_t, std::vector<uint64_t>,
                         std::greater<uint64_t>> erased_heap_;
     std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
@@ -534,21 +532,27 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
     // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
     inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
     inline void push(uint64_t v) {
-      heap_.push(v);
-      heap_top_.store(heap_.top(), std::memory_order_release);
+      push_pop_mutex_.AssertHeld();
+      if (heap_.empty()) {
+        heap_top_.store(v, std::memory_order_release);
+      } else {
+        assert(heap_top_.load() < v);
+      }
+      heap_.push_back(v);
     }
     void pop(bool locked = false) {
      if (!locked) {
        push_pop_mutex()->Lock();
      }
-      heap_.pop();
+      push_pop_mutex_.AssertHeld();
+      heap_.pop_front();
      while (!heap_.empty() && !erased_heap_.empty() &&
             // heap_.top() > erased_heap_.top() could happen if we have erased
             // a non-existent entry. Ideally the user should not do that but we
             // should be resilient against it.
-             heap_.top() >= erased_heap_.top()) {
-        if (heap_.top() == erased_heap_.top()) {
-          heap_.pop();
+             heap_.front() >= erased_heap_.top()) {
+        if (heap_.front() == erased_heap_.top()) {
+          heap_.pop_front();
        }
        uint64_t erased __attribute__((__unused__));
        erased = erased_heap_.top();
@@ -559,7 +563,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
      while (heap_.empty() && !erased_heap_.empty()) {
        erased_heap_.pop();
      }
-      heap_top_.store(!heap_.empty() ? heap_.top() : kMaxSequenceNumber,
+      heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
                      std::memory_order_release);
      if (!locked) {
        push_pop_mutex()->Unlock();
@@ -568,13 +572,16 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
    // Concurrent calls need external synchronization. It is safe to be called
    // concurrent to push and pop though.
    void erase(uint64_t seq) {
-      if (!heap_.empty()) {
+      if (!empty()) {
        auto top_seq = top();
        if (seq < top_seq) {
          // Already popped, ignore it.
} else if (top_seq == seq) { pop(); - assert(heap_.empty() || heap_.top() != seq); +#ifndef NDEBUG + MutexLock ml(push_pop_mutex()); + assert(heap_.empty() || heap_.front() != seq); +#endif } else { // top() > seq // Down the heap, remember to pop it later erased_heap_.push(seq); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 0c94183947f..9382edfad2b 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -225,6 +225,7 @@ Status WriteUnpreparedTxnDB::Initialize( // create 'real' transactions from recovered shell transactions auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; for (auto rtxn : rtxns) { auto recovered_trx = rtxn.second; assert(recovered_trx); @@ -266,9 +267,7 @@ Status WriteUnpreparedTxnDB::Initialize( auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1; assert(batch_info.log_number_); - for (size_t i = 0; i < cnt; i++) { - AddPrepared(seq + i); - } + ordered_seq_cnt[seq] = cnt; assert(wupt->unprep_seqs_.count(seq) == 0); wupt->unprep_seqs_[seq] = cnt; KeySetBuilder keyset_handler(wupt, @@ -288,6 +287,14 @@ Status WriteUnpreparedTxnDB::Initialize( break; } } + // AddPrepared must be called in order + for (auto seq_cnt: ordered_seq_cnt) { + auto seq = seq_cnt.first; + auto cnt = seq_cnt.second; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(seq + i); + } + } SequenceNumber prev_max = max_evicted_seq_; SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); From 4a285d0dd318985b99a88318c96514fd738aa1e6 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 09:42:49 -0700 Subject: [PATCH 134/572] Remove passing const variable to thread (#5443) Summary: CLANG complains that passing const to thread is not necessary. The patch removes it form PreparedHeap::Concurrent test. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5443 Differential Revision: D15781598 Pulled By: maysamyabandeh fbshipit-source-id: 3aceb05d96182fa4726d6d37eed45fd3aac4c016 --- utilities/transactions/write_prepared_transaction_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7830cbd75fc..66ea8fa530f 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -164,7 +164,7 @@ TEST(PreparedHeap, Concurrent) { for (size_t n = 0; n < 100; n++) { last = 0; - t[0] = rocksdb::port::Thread([&heap, t_cnt, &last]() { + t[0] = rocksdb::port::Thread([&heap, &last]() { Random rnd(1103); for (size_t seq = 1; seq <= t_cnt; seq++) { // This is not recommended usage but we should be resilient against it. From f43edff9ac78f8f08edc15092f9e08d4bea10282 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 10:29:56 -0700 Subject: [PATCH 135/572] Disable kPipelinedWrite in MultiThreaded (#5442) Summary: TSAN tests report a race condition. We temporarily exclude kPipelinedWrite from MultiThreaded until the race condition is fixed. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5442 Differential Revision: D15782349 Pulled By: maysamyabandeh fbshipit-source-id: 42b4f9b3fa9137f0675e13ad132c0a06800c1bdd --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index a27a5eeb97f..3bac53f2f0a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2285,6 +2285,7 @@ class MultiThreadedDBTest }; TEST_P(MultiThreadedDBTest, MultiThreaded) { + if (option_config_ == kPipelinedWrite) return; anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); From f9842869cf2dc2278322a4f00ccb45a978c7a923 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 11:09:02 -0700 Subject: [PATCH 136/572] Disable pipeline writes in stress test (#5445) Summary: The tsan crash tests are failing with a data race compliant with pipelined write option. Temporarily disable it until its concurrency issue are fixed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5445 Differential Revision: D15783824 Pulled By: maysamyabandeh fbshipit-source-id: 413a0c3230b86f524fc7eeea2cf8e8375406e65b --- tools/db_crashtest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6487562d8bb..173a6a8da9c 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -37,7 +37,8 @@ "delpercent": 4, "delrangepercent": 1, "destroy_db_initially": 0, - "enable_pipelined_write": lambda: random.randint(0, 1), + # Temporarily disable it until its concurrency issue are fixed + "enable_pipelined_write": 0, "expected_values_path": expected_values_file.name, "flush_one_in": 1000000, "max_background_compactions": 20, From 60f3ec2ca57796203c880d494c872f0086768ce2 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 15:00:53 -0700 Subject: [PATCH 137/572] Fix appveyor compliant about passing const to thread (#5447) Summary: CLANG would complain if we pass const to lambda function and appveyor complains if we don't (https://github.com/facebook/rocksdb/pull/5443). The patch fixes that by using the default capture mode. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5447 Differential Revision: D15788722 Pulled By: maysamyabandeh fbshipit-source-id: 47e7f49264afe31fdafe42cb8bf93da126abfca9 --- utilities/transactions/write_prepared_transaction_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 66ea8fa530f..ef89aaeb8c7 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -164,7 +164,7 @@ TEST(PreparedHeap, Concurrent) { for (size_t n = 0; n < 100; n++) { last = 0; - t[0] = rocksdb::port::Thread([&heap, &last]() { + t[0] = rocksdb::port::Thread([&]() { Random rnd(1103); for (size_t seq = 1; seq <= t_cnt; seq++) { // This is not recommended usage but we should be resilient against it. From 5c76ba9dc4cbc676d8a28264b15af68c1bf06917 Mon Sep 17 00:00:00 2001 From: Patrick Zhang Date: Thu, 13 Jun 2019 11:43:35 -0700 Subject: [PATCH 138/572] Support rocksdbjava aarch64 build and test (#5258) Summary: Verified with an Ampere Computing eMAG aarch64 system. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5258 Differential Revision: D15807309 Pulled By: maysamyabandeh fbshipit-source-id: ab85d2fd3fe40e6094430ab0eba557b1e979510d --- Makefile | 5 ++++- build_tools/build_detect_platform | 2 ++ .../java/org/rocksdb/util/Environment.java | 6 +++++- .../java/org/rocksdb/util/EnvironmentTest.java | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 425c75eb5f5..5944325aafe 100644 --- a/Makefile +++ b/Makefile @@ -1641,7 +1641,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64 aarch64)) ARCH := 64 else ARCH := 32 @@ -1655,6 +1655,9 @@ ifeq (,$(findstring ppc,$(MACHINE))) else ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so endif +ifneq (,$(findstring aarch64,$(MACHINE))) + ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so +endif ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 5d42faa30ae..ac30f9ab0fa 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -540,6 +540,8 @@ if test -z "$PORTABLE"; then elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then # TODO: Handle this with approprite options. COMMON_FLAGS="$COMMON_FLAGS" + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" != "AIX" ] && [ "$TARGET_OS" != "SunOS" ]; then diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index c019266483f..03611a248a6 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -5,6 +5,10 @@ public class Environment { private static String OS = System.getProperty("os.name").toLowerCase(); private static String ARCH = System.getProperty("os.arch").toLowerCase(); + public static boolean isAarch64() { + return ARCH.contains("aarch64"); + } + public static boolean isPowerPC() { return ARCH.contains("ppc"); } @@ -60,7 +64,7 @@ public static String getSharedLibraryFileName(final String name) { public static String getJniLibraryName(final String name) { if (isUnix()) { final String arch = is64Bit() ? 
"64" : "32"; - if(isPowerPC()) { + if(isPowerPC() || isAarch64()) { return String.format("%sjni-linux-%s", name, ARCH); } else if(isS390x()) { return String.format("%sjni-linux%s", name, ARCH); diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index 28ee04768e9..49c8bf19a91 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -130,6 +130,24 @@ public void win64() { isEqualTo("librocksdbjni.dll"); } + @Test + public void aarch64() { + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("rocksdbjni-linux-aarch64"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + } + private void setEnvironmentClassFields(String osName, String osArch) { setEnvironmentClassField(OS_FIELD_NAME, osName); From ec8111c5a4eb8669c097e55a75bd54f2e8c6db81 Mon Sep 17 00:00:00 2001 From: Bin Fan Date: Thu, 13 Jun 2019 12:20:30 -0700 Subject: [PATCH 139/572] Add Alluxio to USERS.md (#5434) Summary: Add Alluxio's use case of RocksDB to `USERS.md` for metadata service Pull Request resolved: https://github.com/facebook/rocksdb/pull/5434 Differential Revision: D15766559 Pulled By: riversand963 fbshipit-source-id: b68ef851f8f92e0925c31e55296260225fdf849e --- USERS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/USERS.md b/USERS.md index a95903f0662..6401757d2bd 100644 --- a/USERS.md +++ b/USERS.md @@ -50,6 +50,10 @@ Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santande ## Airbnb Airbnb is using RocksDB as a storage engine for their personalized search service. You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs +## Alluxio +[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog: +https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/ + ## Pinterest Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo @@ -91,4 +95,4 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed [ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files. ## IOTA Foundation - [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. \ No newline at end of file + [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. 
The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. From 2c9df9f9e5c757c8f368d0860e2da8adb63849a3 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 13 Jun 2019 13:52:43 -0700 Subject: [PATCH 140/572] Dynamic test whether sync_file_range returns ENOSYS (#5416) Summary: `sync_file_range` returns `ENOSYS` on Windows Subsystem for Linux even when using a supposedly supported filesystem like ext4. To handle this case we can do a dynamic check that a no-op `sync_file_range` invocation, which is accomplished by passing zero for the `flags` argument, succeeds. Also I rearranged the function and comments to hopefully make it more easily understandable. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5416 Differential Revision: D15807061 fbshipit-source-id: d31d94e1f228b7850ea500e6199f8b5daf8cfbd3 --- env/io_posix.cc | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/env/io_posix.cc b/env/io_posix.cc index 8b42a636295..304c4ffe1c7 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -186,28 +186,34 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #define ZFS_SUPER_MAGIC 0x2fc12fc1 #endif -bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) { - // `fstatfs` is only available on Linux, but so is `sync_file_range`, so - // `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`. +bool IsSyncFileRangeSupported(int fd) { + // The approach taken in this function is to build a blacklist of cases where + // we know `sync_file_range` definitely will not work properly despite passing + // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or + // if any of the checks fail in unexpected ways, we allow `sync_file_range` to + // be used. This way should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); - if (ret != 0) { - // We don't know whether the filesystem properly supports `sync_file_range`. - // Even if it doesn't, we don't know of any safety issue with trying to call - // it anyways. So, to preserve the same behavior as before this `fstatfs` - // check was introduced, we assume `sync_file_range` is usable. - return true; - } - if (buf.f_type == ZFS_SUPER_MAGIC) { + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { // Testing on ZFS showed the writeback did not happen asynchronously when // `sync_file_range` was called, even though it returned success. Avoid it // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, // even though this'll incur extra I/O for metadata. return false; } - // No known problems with other filesystems' implementations of - // `sync_file_range`, so allow them to use it. + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the cases on the blacklist matched, so allow `sync_file_range` use. 
return true;
}

From a3b8c76d8e3f2a849d354280e9baaac6728a8b4d Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 13 Jun 2019 14:38:54 -0700
Subject: [PATCH 141/572] Add missing check before calling PurgeObsoleteFiles
 in EnableFileDeletions (#5448)

Summary:
Calling PurgeObsoleteFiles with a JobContext for which HaveSomethingToDelete
is false is a precondition violation. This would trigger an assertion in debug
builds; however, in release builds with assertions disabled, it can result in
the pending_purge_obsolete_files_ counter in DBImpl underflowing, which in
turn can lead to the process hanging during database close.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5448

Differential Revision: D15792569

Pulled By: ltamasi

fbshipit-source-id: 82d92c9b4f6a9efcdc69dbb3d5a52a1ae2dd2472
---
 db/db_filesnapshot.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index 3ff7c73f4e8..67d994f5568 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -57,7 +57,9 @@ Status DBImpl::EnableFileDeletions(bool force) {
   }
   if (file_deletion_enabled) {
     ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
-    PurgeObsoleteFiles(job_context);
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "File Deletions Enable, but not really enabled. Counter: %d",

From bb4178066dc4f18b9b7f1d371e641db027b3edbe Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 13 Jun 2019 15:39:52 -0700
Subject: [PATCH 142/572] Integrate block cache tracer into db_impl (#5433)

Summary:
This PR integrates the block cache tracer class into db_impl.cc. db_impl.cc
now owns a BlockCacheTracer member (which manages an atomic pointer to its
BlockCacheTraceWriter) and passes a pointer to that tracer down to the
block_based_table_reader.
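As a rough usage sketch of the new API surface (the trace file path below is
hypothetical; NewFileTraceWriter is the file-backed TraceWriter factory that
the tests in this patch already use):

```
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

// Assumes `db` is an open rocksdb::DB*.
rocksdb::TraceOptions trace_opts;  // default sampling/size limits
std::unique_ptr<rocksdb::TraceWriter> trace_writer;
rocksdb::Status s = rocksdb::NewFileTraceWriter(
    rocksdb::Env::Default(), rocksdb::EnvOptions(),
    "/tmp/block_cache_trace" /* hypothetical path */, &trace_writer);
if (s.ok()) {
  // DBImpl forwards this call to BlockCacheTracer::StartTrace.
  s = db->StartBlockCacheTrace(trace_opts, std::move(trace_writer));
}
// ... run the workload; sampled block cache accesses are written out ...
s = db->EndBlockCacheTrace();
```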
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5433 Differential Revision: D15728016 Pulled By: HaoyuHuang fbshipit-source-id: 23d5659e8c82d556833dcc1a5558aac8c1f7db71 --- TARGETS | 13 +++ db/column_family.cc | 20 ++-- db/column_family.h | 14 ++- db/compaction/compaction_job_test.cc | 6 +- db/db_impl/db_impl.cc | 14 ++- db/db_impl/db_impl.h | 11 +- db/db_wal_test.cc | 3 +- db/flush_job_test.cc | 3 +- db/memtable_list_test.cc | 4 +- db/repair.cc | 6 +- db/table_cache.cc | 8 +- db/table_cache.h | 5 +- db/version_set.cc | 18 ++-- db/version_set.h | 6 +- db/version_set_test.cc | 3 +- db/wal_manager_test.cc | 3 +- include/rocksdb/db.h | 11 ++ include/rocksdb/utilities/stackable_db.h | 10 ++ .../block_based/block_based_table_factory.cc | 3 +- table/block_based/block_based_table_reader.cc | 27 +++-- table/block_based/block_based_table_reader.h | 8 +- .../partitioned_filter_block_test.cc | 3 +- table/table_builder.h | 14 ++- tools/ldb_cmd.cc | 6 +- trace_replay/block_cache_tracer.cc | 66 +++++++++--- trace_replay/block_cache_tracer.h | 35 +++++- trace_replay/block_cache_tracer_test.cc | 102 ++++++++++++++++++ 27 files changed, 341 insertions(+), 81 deletions(-) diff --git a/TARGETS b/TARGETS index 0cdd3b162f9..7a8bb000596 100644 --- a/TARGETS +++ b/TARGETS @@ -222,6 +222,7 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", + "trace_replay/block_cache_tracer.cc", "trace_replay/trace_replay.cc", "util/bloom.cc", "util/build_version.cc", @@ -314,6 +315,7 @@ cpp_library( "test_util/fault_injection_test_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", + "tools/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", "utilities/cassandra/test_utils.cc", ], @@ -329,6 +331,7 @@ cpp_library( name = "rocksdb_tools_lib", srcs = [ "test_util/testutil.cc", + "tools/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", ], @@ -383,6 +386,16 @@ ROCKS_TESTS = [ "table/block_based/block_based_filter_block_test.cc", "serial", ], + [ + "block_cache_trace_analyzer_test", + "tools/block_cache_trace_analyzer_test.cc", + "serial", + ], + [ + "block_cache_tracer_test", + "trace_replay/block_cache_tracer_test.cc", + "serial", + ], [ "block_test", "table/block_based/block_test.cc", diff --git a/db/column_family.cc b/db/column_family.cc index 2a2e6cb980f..e135c2d317f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -405,7 +405,8 @@ ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, ColumnFamilySet* column_family_set) + const EnvOptions& env_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -445,7 +446,8 @@ ColumnFamilyData::ColumnFamilyData( if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options.env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache, + block_cache_tracer)); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -1254,18 +1256,20 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const EnvOptions& 
env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, - ColumnFamilyOptions(), *db_options, - env_options, nullptr)), + dummy_cfd_(new ColumnFamilyData( + 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, + env_options, nullptr, block_cache_tracer)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), env_options_(env_options), table_cache_(table_cache), write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller) { + write_controller_(write_controller), + block_cache_tracer_(block_cache_tracer) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1333,7 +1337,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, env_options_, this); + *db_options_, env_options_, this, block_cache_tracer_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git a/db/column_family.h b/db/column_family.h index 8646b4fc197..8180f0be26a 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -24,6 +24,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "trace_replay/block_cache_tracer.h" #include "util/thread_local.h" namespace rocksdb { @@ -46,7 +47,7 @@ struct SuperVersionContext; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family -// level metadata. +// level metadata. // // The basic relationships among classes declared here are illustrated as // following: @@ -94,7 +95,7 @@ extern const double kIncSlowdownRatio; // | | | 1.a | | 1.b | | 1.c | // +-------------+ | | | | | | // +----------+ +----------+ +----------+ -// +// // DBImpl keeps a ColumnFamilySet, which references to all column families by // pointing to respective ColumnFamilyData object of each column family. // This is how DBImpl can list and operate on all the column families. @@ -151,7 +152,7 @@ extern const double kIncSlowdownRatio; // contains Version B, memtable a and memtable b; SuperVersion1 contains // Version B and memtable b (mutable). As a result, Version B and memtable b // are prevented from being destroyed or deleted. - + // ColumnFamilyHandleImpl is the class that clients use to access different // column families. 
It has non-trivial destructor, which gets called when client // is done using the column family @@ -504,7 +505,8 @@ class ColumnFamilyData { const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, const EnvOptions& env_options, - ColumnFamilySet* column_family_set); + ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer); uint32_t id_; const std::string name_; @@ -632,7 +634,8 @@ class ColumnFamilySet { const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -691,6 +694,7 @@ class ColumnFamilySet { Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; + BlockCacheTracer* const block_cache_tracer_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 66c3353fcf6..add4911891a 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -77,7 +77,8 @@ class CompactionJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), @@ -200,7 +201,8 @@ class CompactionJobTest : public testing::Test { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); compaction_job_stats_.Reset(); VersionEdit new_db; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 27d48539c35..af39b5ca11d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -237,7 +237,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_)); + &write_controller_, &block_cache_tracer_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -3924,6 +3924,18 @@ Status DBImpl::EndTrace() { return s; } +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(env_, trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { Status s; if (tracer_) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 4de15f0324d..942c36ff6e6 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -40,7 +40,6 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "db/memtable_list.h" #include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" @@ -53,6 +52,7 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" 
+#include "trace_replay/block_cache_tracer.h" #include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" @@ -331,6 +331,14 @@ class DBImpl : public DB { using DB::EndTrace; virtual Status EndTrace() override; + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, @@ -832,6 +840,7 @@ class DBImpl : public DB { recovered_transactions_; std::unique_ptr tracer_; InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 9a1382e98ab..4859bdc90f4 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -838,7 +838,8 @@ class RecoveryTestHelper { versions.reset(new VersionSet(test->dbname_, &db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller)); + &write_controller, + /*block_cache_tracer=*/nullptr)); wal_manager.reset(new WalManager(db_options, env_options)); diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index ef89199c98e..130179ae67b 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -35,7 +35,8 @@ class FlushJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index f55fbdc501a..3a14b6830a6 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -100,7 +100,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -144,7 +144,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); diff --git a/db/repair.cc b/db/repair.cc index 6967a46e36c..3ae46c6e7ee 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -109,11 +109,13 @@ class Repairer { // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), table_cache_(new TableCache(default_cf_iopts_, env_options_, - raw_table_cache_.get())), + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), vset_(dbname_, &immutable_db_options_, env_options_, - raw_table_cache_.get(), &wb_, &wc_), + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr), next_file_number_(1), db_lock_(nullptr) { for (const auto& cfd : column_families) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 14c0169c11a..0a152f89a16 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -68,11 +68,13 @@ void AppendVarint64(IterKey* key, uint64_t v) { } // namespace TableCache::TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, Cache* const cache) + const EnvOptions& env_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer) : ioptions_(ioptions), env_options_(env_options), cache_(cache), - immortal_tables_(false) { + immortal_tables_(false), + block_cache_tracer_(block_cache_tracer) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. @@ -125,7 +127,7 @@ Status TableCache::GetTableReader( s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno), + level, fd.largest_seqno, block_cache_tracer_), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); diff --git a/db/table_cache.h b/db/table_cache.h index 64d7b898b22..1577cef82ff 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -23,6 +23,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -48,7 +49,8 @@ class HistogramImpl; class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& storage_options, Cache* cache); + const EnvOptions& storage_options, Cache* cache, + BlockCacheTracer* const block_cache_tracer); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -188,6 +190,7 @@ class TableCache { Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; + BlockCacheTracer* const block_cache_tracer_; }; } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 658a397fa58..30fc744c98a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3342,10 +3342,11 @@ VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) - : column_family_set_( - new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller)), + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer)), env_(_db_options->env), dbname_(dbname), db_options_(_db_options), @@ -3359,7 +3360,8 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) {} + 
env_options_(storage_options), + block_cache_tracer_(block_cache_tracer) {} void CloseTables(void* ptr, size_t) { TableReader* table_reader = reinterpret_cast(ptr); @@ -4445,7 +4447,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); - VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc); + VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status status; std::vector dummy; @@ -5200,7 +5203,8 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, - write_buffer_manager, write_controller) {} + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr) {} ReactiveVersionSet::~ReactiveVersionSet() {} diff --git a/db/version_set.h b/db/version_set.h index 8a43b982366..90be94a789a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -46,6 +46,7 @@ #include "rocksdb/env.h" #include "table/get_context.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -777,7 +778,8 @@ class VersionSet { VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -1125,6 +1127,8 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; + BlockCacheTracer* const block_cache_tracer_; + private: // No copying allowed VersionSet(const VersionSet&); diff --git a/db/version_set_test.cc b/db/version_set_test.cc index bf9ef8e39fe..a1278bfc7ad 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -618,7 +618,8 @@ class VersionSetTestBase { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), reactive_versions_(std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_)), diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 3657fb691be..1bc6a8afe83 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -50,7 +50,8 @@ class WalManagerTest : public testing::Test { versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); wal_manager_.reset(new WalManager(db_options_, env_options_)); } diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index b0538433b4a..3a32d6f82bd 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1317,6 +1317,17 @@ class DB { virtual Status EndTrace() { return Status::NotSupported("EndTrace() is not implemented."); } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. 
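+  // Accesses are downsampled according to TraceOptions::sampling_frequency,
+  // and writes stop silently once the trace file exceeds
+  // TraceOptions::max_trace_file_size (see trace_replay/block_cache_tracer.cc).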
+ virtual Status StartBlockCacheTrace( + const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 6e98a48e591..8535952cd3e 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -315,6 +315,16 @@ class StackableDB : public DB { db_->GetColumnFamilyMetaData(column_family, cf_meta); } + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index cf205be72de..00b13033f3d 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -198,7 +198,8 @@ Status BlockBasedTableFactory::NewTableReader( file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_); + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 75c8301c5c2..7434188a01d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1020,19 +1020,17 @@ Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, return Slice(cache_key, static_cast(end - cache_key)); } -Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, - const bool prefetch_index_and_filter_in_cache, - const bool skip_filters, const int level, - const bool immortal_table, - const SequenceNumber largest_seqno, - TailPrefetchStats* tail_prefetch_stats) { +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { table_reader->reset(); Status s; @@ -1082,7 +1080,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); 
SetupCacheKeyPrefix(rep); - std::unique_ptr new_table(new BlockBasedTable(rep)); + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); // page cache options rep->persistent_cache_options = diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 420da25932b..223746b3ac9 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -35,6 +35,7 @@ #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "trace_replay/block_cache_tracer.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/user_comparator_wrapper.h" @@ -108,7 +109,8 @@ class BlockBasedTable : public TableReader { bool skip_filters = false, int level = -1, const bool immortal_table = false, const SequenceNumber largest_seqno = 0, - TailPrefetchStats* tail_prefetch_stats = nullptr); + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -239,11 +241,13 @@ class BlockBasedTable : public TableReader { protected: Rep* rep_; - explicit BlockBasedTable(Rep* rep) : rep_(rep) {} + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} private: friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 5af7034968a..34ecfa4ac65 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -23,7 +23,8 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + explicit MockedBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test rep->cache_key_prefix_size = 10; } diff --git a/table/table_builder.h b/table/table_builder.h index 21df978c3eb..23189200c64 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -18,6 +18,7 @@ #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" +#include "trace_replay/block_cache_tracer.h" #include "util/file_reader_writer.h" namespace rocksdb { @@ -32,10 +33,12 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, - int _level = -1) + int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr) : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno */) {} + _level, 0 /* _largest_seqno */, + _block_cache_tracer) {} // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, @@ -43,7 +46,8 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno) + SequenceNumber _largest_seqno, + 
BlockCacheTracer* const _block_cache_tracer) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -51,7 +55,8 @@ struct TableReaderOptions { skip_filters(_skip_filters), immortal(_immortal), level(_level), - largest_seqno(_largest_seqno) {} + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -65,6 +70,7 @@ struct TableReaderOptions { int level; // largest seqno in the table SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; }; struct TableBuilderOptions { diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 958d862fd32..49489173c33 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -954,7 +954,8 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, WriteController wc(options.delayed_write_rate); WriteBufferManager wb(options.db_write_buffer_size); ImmutableDBOptions immutable_db_options(options); - VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc); + VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status s = versions.DumpManifest(options, file, verbose, hex, json); if (!s.ok()) { printf("Error in processing file %s %s\n", file.c_str(), @@ -1664,7 +1665,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, const InternalKeyComparator cmp(opt.comparator); WriteController wc(opt.delayed_write_rate); WriteBufferManager wb(opt.db_write_buffer_size); - VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc); + VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 58c7df70b20..565511e5a07 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -23,30 +23,29 @@ bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { record.caller == BlockCacheLookupCaller::kUserMGet); } -BlockCacheTraceWriter::BlockCacheTraceWriter( - Env* env, const TraceOptions& trace_options, - std::unique_ptr&& trace_writer) - : env_(env), - trace_options_(trace_options), - trace_writer_(std::move(trace_writer)) {} - -bool BlockCacheTraceWriter::ShouldTrace( - const BlockCacheTraceRecord& record) const { - if (trace_options_.sampling_frequency == 0 || - trace_options_.sampling_frequency == 1) { +bool ShouldTrace(const BlockCacheTraceRecord& record, + const TraceOptions& trace_options) { + if (trace_options.sampling_frequency == 0 || + trace_options.sampling_frequency == 1) { return true; } // We use spatial downsampling so that we have a complete access history for a // block. 
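  // For example, with sampling_frequency == 4, a block whose key hashes to a
  // multiple of four has every one of its accesses recorded, while all other
  // blocks are skipped entirely; sampling individual accesses instead would
  // leave every per-block access history incomplete.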
const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); - return hash % trace_options_.sampling_frequency == 0; + return hash % trace_options.sampling_frequency == 0; } +BlockCacheTraceWriter::BlockCacheTraceWriter( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) {} + Status BlockCacheTraceWriter::WriteBlockAccess( const BlockCacheTraceRecord& record) { uint64_t trace_file_size = trace_writer_->GetFileSize(); - if (trace_file_size > trace_options_.max_trace_file_size || - !ShouldTrace(record)) { + if (trace_file_size > trace_options_.max_trace_file_size) { return Status::OK(); } Trace trace; @@ -68,7 +67,6 @@ Status BlockCacheTraceWriter::WriteBlockAccess( } std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); - InstrumentedMutexLock lock_guard(&trace_writer_mutex_); return trace_writer_->Write(encoded_trace); } @@ -81,7 +79,6 @@ Status BlockCacheTraceWriter::WriteHeader() { PutFixed32(&trace.payload, kMinorVersion); std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); - InstrumentedMutexLock lock_guard(&trace_writer_mutex_); return trace_writer_->Write(encoded_trace); } @@ -216,4 +213,41 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::OK(); } +BlockCacheTracer::BlockCacheTracer() { writer_.store(nullptr); } + +BlockCacheTracer::~BlockCacheTracer() { EndTrace(); } + +Status BlockCacheTracer::StartTrace( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (writer_.load()) { + return Status::OK(); + } + trace_options_ = trace_options; + writer_.store( + new BlockCacheTraceWriter(env, trace_options, std::move(trace_writer))); + return writer_.load()->WriteHeader(); +} + +void BlockCacheTracer::EndTrace() { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return; + } + delete writer_.load(); + writer_.store(nullptr); +} + +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { + if (!writer_.load() || !ShouldTrace(record, trace_options_)) { + return Status::OK(); + } + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return Status::OK(); + } + return writer_.load()->WriteBlockAccess(record); +} + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e24d5a5ef35..320e6d67b3c 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -5,6 +5,8 @@ #pragma once +#include + #include "monitoring/instrumented_mutex.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -101,13 +103,9 @@ class BlockCacheTraceWriter { Status WriteHeader(); private: - bool ShouldTrace(const BlockCacheTraceRecord& record) const; - Env* env_; TraceOptions trace_options_; std::unique_ptr trace_writer_; - /*Mutex to protect trace_writer_ */ - InstrumentedMutex trace_writer_mutex_; }; // BlockCacheTraceReader helps read the trace file generated by @@ -130,4 +128,33 @@ class BlockCacheTraceReader { std::unique_ptr trace_reader_; }; +// A block cache tracer. It downsamples the accesses according to +// trace_options and uses BlockCacheTraceWriter to write the access record to +// the trace file. +class BlockCacheTracer { + public: + BlockCacheTracer(); + ~BlockCacheTracer(); + // No copy and move. 
+ BlockCacheTracer(const BlockCacheTracer&) = delete; + BlockCacheTracer& operator=(const BlockCacheTracer&) = delete; + BlockCacheTracer(BlockCacheTracer&&) = delete; + BlockCacheTracer& operator=(BlockCacheTracer&&) = delete; + + // Start writing block cache accesses to the trace_writer. + Status StartTrace(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + + // Stop writing block cache accesses to the trace_writer. + void EndTrace(); + + Status WriteBlockAccess(const BlockCacheTraceRecord& record); + + private: + TraceOptions trace_options_; + // A mutex protects the writer_. + InstrumentedMutex trace_writer_mutex_; + std::atomic writer_; +}; + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 28052d9db8d..c6fc3e4acee 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -80,6 +80,26 @@ class BlockCacheTracerTest : public testing::Test { } } + BlockCacheTraceRecord GenerateAccessRecord() { + uint32_t key_id = 0; + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = kBlockSize; + record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.access_timestamp = env_->NowMicros(); + record.cf_id = kCFId; + record.cf_name = kDefaultColumnFamilyName; + record.caller = GetCaller(key_id); + record.level = kLevel; + record.sst_fd_number = kSSTFDNumber + key_id; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + record.referenced_key = kRefKeyPrefix + std::to_string(key_id); + record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.num_keys_in_block = kNumKeysInBlock; + return record; + } + void VerifyAccess(BlockCacheTraceReader* reader, uint32_t from_key_id, TraceType block_type, uint32_t nblocks) { assert(reader); @@ -118,6 +138,88 @@ class BlockCacheTracerTest : public testing::Test { std::string test_path_; }; +TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + // The record should be written to the trace_file since StartTrace is not + // called. + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains nothing. + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_NOK(reader.ReadHeader(&header)); + } +} + +TEST_F(BlockCacheTracerTest, AtomicWrite) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains one record. 
+ std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); + ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); + VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1); + ASSERT_NOK(reader.ReadAccess(&record)); + } +} + +TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(writer.WriteBlockAccess(record)); + writer.EndTrace(); + // Write the record again. This time the record should not be written since + // EndTrace is called. + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains one record. + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); + ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); + VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1); + ASSERT_NOK(reader.ReadAccess(&record)); + } +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. From 89695bfbaafd6fd589ad37e31ab27d9cf25e9930 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 14 Jun 2019 09:13:48 -0700 Subject: [PATCH 143/572] Remove unused variable (#5457) Summary: This PR removes the unused variable that causes CLANG build to fail. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5457 Differential Revision: D15825027 Pulled By: HaoyuHuang fbshipit-source-id: 72c847c39ca310560efcbc5938cffa6f31164068 --- trace_replay/block_cache_tracer_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index c6fc3e4acee..0f3ca67c611 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -141,7 +141,6 @@ class BlockCacheTracerTest : public testing::Test { TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { BlockCacheTraceRecord record = GenerateAccessRecord(); { - TraceOptions trace_opt; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); From 58c78358ef0442ec3adeffa1df1dd43a593177ce Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Fri, 14 Jun 2019 10:33:45 -0700 Subject: [PATCH 144/572] Set executeLocal on child lego jobs (#5456) Summary: This property is needed to run the child jobs on the same host and thus propagate the child job status back to the parent's. 
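For illustration, every job spec emitted by the determinator now carries the
flag next to its existing fields (abridged sketch; the field names match the
hunks below):

```
{
  'name':'Rocksdb Unit Test',
  'oncall':'$ONCALL',
  'executeLocal': 'true',
  'steps': [ ... ]
}
```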
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5456 Reviewed By: yancouto Differential Revision: D15824382 Pulled By: maysamyabandeh fbshipit-source-id: 42f2efbedaa3a8b399281105f0ce793c1c9a6191 --- build_tools/rocksdb-lego-determinator | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index e47b2ef30d8..dc32b3af9ff 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -133,6 +133,7 @@ UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -153,6 +154,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -175,6 +177,7 @@ RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -195,6 +198,7 @@ UNIT_TEST_COMMANDS_481="[ { 'name':'Rocksdb Unit Test on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -215,6 +219,7 @@ RELEASE_BUILD_COMMANDS_481="[ { 'name':'Rocksdb Release on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -235,6 +240,7 @@ CLANG_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -255,6 +261,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb CLANG Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -275,6 +282,7 @@ CLANG_ANALYZE_COMMANDS="[ { 'name':'Rocksdb analyze', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -295,6 +303,7 @@ CODE_COV_COMMANDS="[ { 'name':'Rocksdb Unit Test Code Coverage', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -315,6 +324,7 @@ UNITY_COMMANDS="[ { 'name':'Rocksdb Unity', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -335,6 +345,7 @@ LITE_BUILD_COMMANDS="[ { 'name':'Rocksdb Lite build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -354,6 +365,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ { 'name':'Rocksdb Lite Binary Size', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -371,6 +383,7 @@ STRESS_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -399,6 +412,7 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -427,6 +441,7 @@ WRITE_STRESS_COMMANDS="[ { 'name':'Rocksdb Write Stress Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -449,6 +464,7 @@ ASAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -469,6 +485,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -491,6 +508,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -513,6 +531,7 @@ UBSAN_TEST_COMMANDS="[ 
{ 'name':'Rocksdb Unit Test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -533,6 +552,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -555,6 +575,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -577,6 +598,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under valgrind', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -599,6 +621,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -621,6 +644,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Crash Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -643,6 +667,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Crash Test with atomic flush under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -675,6 +700,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ { 'name':'Rocksdb Format Compatible tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -708,6 +734,7 @@ NO_COMPRESSION_COMMANDS="[ { 'name':'Rocksdb No Compression tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -785,6 +812,7 @@ JAVA_BUILD_TEST_COMMANDS="[ { 'name':'Rocksdb Java Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { From b47cfec5d01fd8c848b7539e5b43884257dba613 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Fri, 14 Jun 2019 11:24:02 -0700 Subject: [PATCH 145/572] fix compilation error on MSVC (#5458) Summary: "__attribute__((__weak__))" was introduced in port\jemalloc_helper.h. It's not supported by Microsoft VS 2015, resulting in compile error. This fix adds a #if branch to work around the compile issue. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5458 Differential Revision: D15827285 fbshipit-source-id: 8c5f7ad31de1ac677bd96f16c4450767de834beb --- port/jemalloc_helper.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 0c216face13..26e5fb66336 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -16,6 +16,14 @@ #define JEMALLOC_CXX_THROW #endif +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is defined, +// Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. 
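// For example, when jemalloc is not linked in, a weakly declared mallocx
// resolves to nullptr, so a check like (mallocx != nullptr) works as a cheap
// runtime probe; MSVC has no weak-symbol equivalent, hence the unconditional
// branch above.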
extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); @@ -50,4 +58,6 @@ static inline bool HasJemalloc() { malloc_stats_print != nullptr && malloc_usable_size != nullptr; } +#endif + #endif // ROCKSDB_JEMALLOC From f1219644ec834a96f3a2a13d83046126e8e7409d Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 14 Jun 2019 14:07:50 -0700 Subject: [PATCH 146/572] Validate CF Options when creating a new column family (#5453) Summary: It seems like CF Options are not properly validated when creating a new column family with `CreateColumnFamily` API; only a selected few checks are done. Calling `ColumnFamilyData::ValidateOptions`, which is the single source for all CFOptions validations, will help fix this. (`ColumnFamilyData::ValidateOptions` is already called at the time of `DB::Open`). **Test Plan:** Added a new test: `DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions` ``` TEST_TMPDIR=/dev/shm ./db_test --gtest_filter=DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions ``` Also ran gtest-parallel to make sure the new test is not flaky. ``` TEST_TMPDIR=/dev/shm ~/gtest-parallel/gtest-parallel ./db_test --gtest_filter=DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions --repeat=10000 [10000/10000] DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions (15 ms) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5453 Differential Revision: D15816851 Pulled By: sagar0 fbshipit-source-id: 9e702b9850f5c4a7e0ef8d39e1e6f9b81e7fe1e5 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 10 +++------- db/db_test.cc | 13 +++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5574c769878..228d02b61df 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. +* Improve ColumnFamilyOptions validation when creating a new column family. ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index af39b5ca11d..154e6dd2339 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1944,13 +1944,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status persist_options_status; *handle = nullptr; - s = CheckCompressionSupported(cf_options); - if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cf_options); - } - if (s.ok()) { - s = CheckCFPathsSupported(initial_db_options_, cf_options); - } + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); if (s.ok()) { for (auto& cf_path : cf_options.cf_paths) { s = env_->CreateDirIfMissing(cf_path.path); diff --git a/db/db_test.cc b/db/db_test.cc index 3bac53f2f0a..0204f4d9f62 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5978,6 +5978,19 @@ TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { } } +TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { + Options options = CurrentOptions(); + options.max_open_files = 100; + Reopen(options); + + ColumnFamilyOptions cf_options(options); + // ttl is only supported when max_open_files is -1. 
+ cf_options.ttl = 3600; + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_options, "pikachu", &handle)); + delete handle; +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); From 7a8d7358bb40b13a06c2c6adc62e80295d89ed05 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 14 Jun 2019 17:37:24 -0700 Subject: [PATCH 147/572] Integrate block cache tracer in block based table reader. (#5441) Summary: This PR integrates the block cache tracer into block based table reader. The tracer will write the block cache accesses using the trace_writer. The tracer is null in this PR so that nothing will be logged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5441 Differential Revision: D15772029 Pulled By: HaoyuHuang fbshipit-source-id: a64adb92642cd23222e0ba8b10d86bf522b42f9b --- table/block_based/block_based_table_reader.cc | 265 ++++++++++++++---- table/block_based/block_based_table_reader.h | 18 ++ tools/block_cache_trace_analyzer.h | 5 +- tools/block_cache_trace_analyzer_test.cc | 5 +- trace_replay/block_cache_tracer.cc | 62 ++-- trace_replay/block_cache_tracer.h | 89 +++++- trace_replay/block_cache_tracer_test.cc | 31 +- 7 files changed, 365 insertions(+), 110 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 7434188a01d..0caea508822 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1877,9 +1877,8 @@ CachableEntry BlockBasedTable::GetFilter( CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/, + BlockCacheLookupContext* lookup_context, const SliceTransform* prefix_extractor) const { - // TODO(haoyu): Trace filter block access here. // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will @@ -1912,17 +1911,22 @@ CachableEntry BlockBasedTable::GetFilter( GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); FilterBlockReader* filter = nullptr; + size_t usage = 0; + bool is_cache_hit = false; + bool return_empty_reader = false; if (cache_handle != nullptr) { filter = reinterpret_cast(block_cache->Value(cache_handle)); + usage = filter->ApproximateMemoryUsage(); + is_cache_hit = true; } else if (no_io) { // Do not invoke any io. - return CachableEntry(); + return_empty_reader = true; } else { filter = ReadFilter(prefetch_buffer, filter_blk_handle, is_a_filter_partition, prefix_extractor); if (filter != nullptr) { - size_t usage = filter->ApproximateMemoryUsage(); + usage = filter->ApproximateMemoryUsage(); Status s = block_cache->Insert( key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, rep_->table_options.cache_index_and_filter_blocks_with_high_priority @@ -1934,19 +1938,36 @@ CachableEntry BlockBasedTable::GetFilter( } else { RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete filter; - return CachableEntry(); + return_empty_reader = true; } } } + if (block_cache_tracer_ && lookup_context) { + // Avoid making copy of block_key and cf_name when constructing the access + // record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", TraceType::kBlockTraceFilterBlock, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + /*no_insert=*/no_io); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } + + if (return_empty_reader) { + return CachableEntry(); + } return {filter, cache_handle ? block_cache : nullptr, cache_handle, /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/) const { - // TODO(haoyu): Trace the access on the uncompression dictionary here. + BlockCacheLookupContext* lookup_context) const { if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. @@ -1964,9 +1985,13 @@ CachableEntry BlockBasedTable::GetUncompressionDict( GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, BlockType::kCompressionDictionary, get_context); UncompressionDict* dict = nullptr; + bool is_cache_hit = false; + size_t usage = 0; if (cache_handle != nullptr) { dict = reinterpret_cast( rep_->table_options.block_cache->Value(cache_handle)); + is_cache_hit = true; + usage = dict->ApproximateMemoryUsage(); } else if (no_io) { // Do not invoke any io. } else { @@ -1980,7 +2005,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( new UncompressionDict(compression_dict_block->data.ToString(), rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); - const size_t usage = uncompression_dict->ApproximateMemoryUsage(); + usage = uncompression_dict->ApproximateMemoryUsage(); s = rep_->table_options.block_cache->Insert( cache_key, uncompression_dict.get(), usage, &DeleteCachedUncompressionDictEntry, &cache_handle, @@ -2000,6 +2025,20 @@ CachableEntry BlockBasedTable::GetUncompressionDict( } } } + if (block_cache_tracer_ && lookup_context) { + // Avoid making copy of block_key and cf_name when constructing the access + // record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + /*no_insert=*/no_io); + block_cache_tracer_->WriteBlockAccess(access_record, cache_key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, cache_handle, false /* own_value */}; } @@ -2116,13 +2155,10 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/) const { - // TODO(haoyu): Trace data/index/range deletion block access here. 
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); - // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = rep_->immortal_table ? nullptr @@ -2136,6 +2172,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key /* key to the block cache */; Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + bool no_insert = true; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { @@ -2152,10 +2190,15 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, ro, block_entry, uncompression_dict, block_type, get_context); - + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + no_insert = false; Statistics* statistics = rep_->ioptions.statistics; bool do_decompress = block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; @@ -2186,6 +2229,59 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } } } + + // Fill lookup_context. + if (block_cache_tracer_ && lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. + nkeys = rep_->table_options.block_restart_interval * + block_entry->GetValue()->NumRestarts(); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + if (BlockCacheTraceHelper::ShouldTraceReferencedKey( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., the referenced key, + // referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copy of block_key and cf_name when constructing the access + // record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } + } + assert(s.ok() || block_entry->GetValue() == nullptr); return s; } @@ -2874,11 +2970,15 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } else { + BlockCacheLookupContext lookup_data_block_context{ + BlockCacheLookupCaller::kUserGet}; + bool does_referenced_key_exist = false; DataBlockIter biter; + uint64_t referenced_data_size = 0; NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, /*key_includes_seq=*/true, - /*index_key_is_full=*/true, get_context, &lookup_context, + /*index_key_is_full=*/true, get_context, &lookup_data_block_context, /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (read_options.read_tier == kBlockCacheTier && @@ -2902,25 +3002,47 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. - break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - done = true; - break; + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; + } } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } - s = biter.status(); } + if (done) { // Avoid the extra Next which is expensive in two-level indexes break; @@ -2992,14 +3114,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { bool reusing_block = true; + uint64_t referenced_data_size = 0; + bool does_referenced_key_exist = false; + BlockCacheLookupContext lookup_data_block_context( + BlockCacheLookupCaller::kUserMGet); if (iiter->value().offset() != offset) { offset = iiter->value().offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, /*key_includes_seq=*/false, - /*index_key_is_full=*/true, get_context, &lookup_context, - Status(), nullptr); + /*index_key_is_full=*/true, get_context, + &lookup_data_block_context, Status(), nullptr); reusing_block = false; } if (read_options.read_tier == kBlockCacheTier && @@ -3021,38 +3147,59 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. 
- break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - Cleanable dummy; - Cleanable* value_pinner = nullptr; - - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - if (biter.IsValuePinned()) { - if (reusing_block) { - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(biter.cache_handle() != nullptr); - block_cache->Ref(biter.cache_handle()); - dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, - biter.cache_handle()); - value_pinner = &dummy; - } else { - value_pinner = &biter; + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter.IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter.cache_handle() != nullptr); + block_cache->Ref(biter.cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter.cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = &biter; + } } - } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, value_pinner)) { - done = true; - break; + if (!get_context->SaveValue(parsed_key, biter.value(), &matched, + value_pinner)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; + } } + s = biter.status(); + } + // Write the block cache access. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } - s = biter.status(); if (done) { // Avoid the extra Next which is expensive in two-level indexes break; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 223746b3ac9..17c4e7238c8 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -17,6 +17,7 @@ #include #include "db/range_tombstone_fragmenter.h" +#include "file/filename.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" @@ -571,6 +572,23 @@ struct BlockBasedTable::Rep { ? kDisableGlobalSequenceNumber : global_seqno; } + + uint64_t cf_id_for_tracing() const { + return table_properties ? table_properties->column_family_id + : rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? 
table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } }; // Iterates over the contents of BlockBasedTable. diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 9dde8a939b5..51bb1ec7930 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -35,10 +35,11 @@ struct BlockAccessInfo { block_size = access.block_size; caller_num_access_map[access.caller]++; num_accesses++; - if (ShouldTraceReferencedKey(access)) { + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type, + access.caller)) { num_keys = access.num_keys_in_block; - if (access.is_referenced_key_exist_in_block == Boolean::kTrue) { + if (access.referenced_key_exist_in_block == Boolean::kTrue) { key_num_access_map[access.referenced_key]++; num_referenced_key_exist_in_block++; } else { diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 96f52c1ec00..a75804492f6 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -89,9 +89,10 @@ class BlockCacheTracerTest : public testing::Test { // The writer should only write these fields for data blocks and the // caller is either GET or MGET. record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; - ASSERT_OK(writer->WriteBlockAccess(record)); + ASSERT_OK(writer->WriteBlockAccess( + record, record.block_key, record.cf_name, record.referenced_key)); } } diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 565511e5a07..f733bc9005f 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -15,13 +15,6 @@ namespace rocksdb { namespace { const unsigned int kCharSize = 1; -} // namespace - -bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { - return (record.block_type == TraceType::kBlockTraceDataBlock) && - (record.caller == BlockCacheLookupCaller::kUserGet || - record.caller == BlockCacheLookupCaller::kUserMGet); -} bool ShouldTrace(const BlockCacheTraceRecord& record, const TraceOptions& trace_options) { @@ -34,6 +27,17 @@ bool ShouldTrace(const BlockCacheTraceRecord& record, const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); return hash % trace_options.sampling_frequency == 0; } +} // namespace + +const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = + "UnknownColumnFamily"; + +bool BlockCacheTraceHelper::ShouldTraceReferencedKey( + TraceType block_type, BlockCacheLookupCaller caller) { + return (block_type == TraceType::kBlockTraceDataBlock) && + (caller == BlockCacheLookupCaller::kUserGet || + caller == BlockCacheLookupCaller::kUserMGet); +} BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, @@ -43,7 +47,8 @@ BlockCacheTraceWriter::BlockCacheTraceWriter( trace_writer_(std::move(trace_writer)) {} Status BlockCacheTraceWriter::WriteBlockAccess( - const BlockCacheTraceRecord& record) { + const BlockCacheTraceRecord& record, const Slice& block_key, + const Slice& cf_name, const Slice& referenced_key) { uint64_t trace_file_size = 
trace_writer_->GetFileSize(); if (trace_file_size > trace_options_.max_trace_file_size) { return Status::OK(); @@ -51,19 +56,21 @@ Status BlockCacheTraceWriter::WriteBlockAccess( Trace trace; trace.ts = record.access_timestamp; trace.type = record.block_type; - PutLengthPrefixedSlice(&trace.payload, record.block_key); + PutLengthPrefixedSlice(&trace.payload, block_key); PutFixed64(&trace.payload, record.block_size); - PutFixed32(&trace.payload, record.cf_id); - PutLengthPrefixedSlice(&trace.payload, record.cf_name); + PutFixed64(&trace.payload, record.cf_id); + PutLengthPrefixedSlice(&trace.payload, cf_name); PutFixed32(&trace.payload, record.level); - PutFixed32(&trace.payload, record.sst_fd_number); + PutFixed64(&trace.payload, record.sst_fd_number); trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); - if (ShouldTraceReferencedKey(record)) { - PutLengthPrefixedSlice(&trace.payload, record.referenced_key); + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, + record.caller)) { + PutLengthPrefixedSlice(&trace.payload, referenced_key); + PutFixed64(&trace.payload, record.referenced_data_size); PutFixed64(&trace.payload, record.num_keys_in_block); - trace.payload.push_back(record.is_referenced_key_exist_in_block); + trace.payload.push_back(record.referenced_key_exist_in_block); } std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); @@ -143,6 +150,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { record->access_timestamp = trace.ts; record->block_type = trace.type; Slice enc_slice = Slice(trace.payload); + Slice block_key; if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) { return Status::Incomplete( @@ -153,7 +161,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read block size."); } - if (!GetFixed32(&enc_slice, &record->cf_id)) { + if (!GetFixed64(&enc_slice, &record->cf_id)) { return Status::Incomplete( "Incomplete access record: Failed to read column family ID."); } @@ -167,7 +175,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read level."); } - if (!GetFixed32(&enc_slice, &record->sst_fd_number)) { + if (!GetFixed64(&enc_slice, &record->sst_fd_number)) { return Status::Incomplete( "Incomplete access record: Failed to read SST file number."); } @@ -190,13 +198,18 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - if (ShouldTraceReferencedKey(*record)) { + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, + record->caller)) { Slice referenced_key; if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { return Status::Incomplete( "Incomplete access record: Failed to read the referenced key."); } record->referenced_key = referenced_key.ToString(); + if (!GetFixed64(&enc_slice, &record->referenced_data_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced data size."); + } if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) { return Status::Incomplete( "Incomplete access record: Failed to read the number of keys in the " @@ -205,10 +218,9 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { if (enc_slice.empty()) { return Status::Incomplete( 
"Incomplete access record: Failed to read " - "is_referenced_key_exist_in_block."); + "referenced_key_exist_in_block."); } - record->is_referenced_key_exist_in_block = - static_cast(enc_slice[0]); + record->referenced_key_exist_in_block = static_cast(enc_slice[0]); } return Status::OK(); } @@ -239,7 +251,10 @@ void BlockCacheTracer::EndTrace() { writer_.store(nullptr); } -Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, + const Slice& cf_name, + const Slice& referenced_key) { if (!writer_.load() || !ShouldTrace(record, trace_options_)) { return Status::OK(); } @@ -247,7 +262,8 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { if (!writer_.load()) { return Status::OK(); } - return writer_.load()->WriteBlockAccess(record); + return writer_.load()->WriteBlockAccess(record, block_key, cf_name, + referenced_key); } } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 320e6d67b3c..bf88133111e 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -49,28 +49,80 @@ struct BlockCacheLookupContext { BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) : caller(_caller) {} const BlockCacheLookupCaller caller; + // These are populated when we perform lookup/insert on block cache. The block + // cache tracer uses these inforation when logging the block access at + // BlockBasedTable::GET and BlockBasedTable::MultiGet. + bool is_cache_hit = false; + bool no_insert = false; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + std::string block_key; + uint64_t num_keys_in_block = 0; + + void FillLookupContext(bool _is_cache_hit, bool _no_insert, + TraceType _block_type, uint64_t _block_size, + const std::string& _block_key, + uint64_t _num_keys_in_block) { + is_cache_hit = _is_cache_hit; + no_insert = _no_insert; + block_type = _block_type; + block_size = _block_size; + block_key = _block_key; + num_keys_in_block = _num_keys_in_block; + } }; enum Boolean : char { kTrue = 1, kFalse = 0 }; struct BlockCacheTraceRecord { // Required fields for all accesses. - uint64_t access_timestamp; + uint64_t access_timestamp = 0; std::string block_key; - TraceType block_type; - uint64_t block_size; - uint32_t cf_id; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + uint64_t cf_id = 0; std::string cf_name; - uint32_t level; - uint32_t sst_fd_number; - BlockCacheLookupCaller caller; - Boolean is_cache_hit; - Boolean no_insert; + uint32_t level = 0; + uint64_t sst_fd_number = 0; + BlockCacheLookupCaller caller = + BlockCacheLookupCaller::kMaxBlockCacheLookupCaller; + Boolean is_cache_hit = Boolean::kFalse; + Boolean no_insert = Boolean::kFalse; // Required fields for data block and user Get/Multi-Get only. 
std::string referenced_key; + uint64_t referenced_data_size = 0; uint64_t num_keys_in_block = 0; - Boolean is_referenced_key_exist_in_block = Boolean::kFalse; + Boolean referenced_key_exist_in_block = Boolean::kFalse; + + BlockCacheTraceRecord() {} + + BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, + TraceType _block_type, uint64_t _block_size, + uint64_t _cf_id, std::string _cf_name, uint32_t _level, + uint64_t _sst_fd_number, BlockCacheLookupCaller _caller, + bool _is_cache_hit, bool _no_insert, + std::string _referenced_key = "", + uint64_t _referenced_data_size = 0, + uint64_t _num_keys_in_block = 0, + bool _referenced_key_exist_in_block = false) + : access_timestamp(_access_timestamp), + block_key(_block_key), + block_type(_block_type), + block_size(_block_size), + cf_id(_cf_id), + cf_name(_cf_name), + level(_level), + sst_fd_number(_sst_fd_number), + caller(_caller), + is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), + no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), + referenced_key(_referenced_key), + referenced_data_size(_referenced_data_size), + num_keys_in_block(_num_keys_in_block), + referenced_key_exist_in_block( + _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) { + } }; struct BlockCacheTraceHeader { @@ -79,7 +131,13 @@ struct BlockCacheTraceHeader { uint32_t rocksdb_minor_version; }; -bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record); +class BlockCacheTraceHelper { + public: + static bool ShouldTraceReferencedKey(TraceType block_type, + BlockCacheLookupCaller caller); + + static const std::string kUnknownColumnFamilyName; +}; // BlockCacheTraceWriter captures all RocksDB block cache accesses using a // user-provided TraceWriter. Every RocksDB operation is written as a single @@ -96,7 +154,10 @@ class BlockCacheTraceWriter { BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete; BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete; - Status WriteBlockAccess(const BlockCacheTraceRecord& record); + // Pass Slice references to avoid copy. + Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key); // Write a trace header at the beginning, typically on initiating a trace, // with some metadata like a magic number and RocksDB version. @@ -148,7 +209,9 @@ class BlockCacheTracer { // Stop writing block cache accesses to the trace_writer. 
void EndTrace(); - Status WriteBlockAccess(const BlockCacheTraceRecord& record); + Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key); private: TraceOptions trace_options_; diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 0f3ca67c611..95fe16b8c8f 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -20,6 +20,7 @@ const uint32_t kLevel = 1; const uint64_t kSSTFDNumber = 100; const std::string kRefKeyPrefix = "test-get-"; const uint64_t kNumKeysInBlock = 1024; +const uint64_t kReferencedDataSize = 10; } // namespace class BlockCacheTracerTest : public testing::Test { @@ -61,7 +62,7 @@ class BlockCacheTracerTest : public testing::Test { BlockCacheTraceRecord record; record.block_type = block_type; record.block_size = kBlockSize + key_id; - record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.block_key = (kBlockKeyPrefix + std::to_string(key_id)); record.access_timestamp = env_->NowMicros(); record.cf_id = kCFId; record.cf_name = kDefaultColumnFamilyName; @@ -73,10 +74,12 @@ class BlockCacheTracerTest : public testing::Test { // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. - record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key = (kRefKeyPrefix + std::to_string(key_id)); + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; - ASSERT_OK(writer->WriteBlockAccess(record)); + record.referenced_data_size = kReferencedDataSize + key_id; + ASSERT_OK(writer->WriteBlockAccess( + record, record.block_key, record.cf_name, record.referenced_key)); } } @@ -95,7 +98,7 @@ class BlockCacheTracerTest : public testing::Test { record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; return record; } @@ -122,13 +125,15 @@ class BlockCacheTracerTest : public testing::Test { record.caller == BlockCacheLookupCaller::kUserMGet)) { ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), record.referenced_key); - ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block); + ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block); + ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size); continue; } ASSERT_EQ("", record.referenced_key); - ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block); + ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block); ASSERT_EQ(0, record.num_keys_in_block); + ASSERT_EQ(0, record.referenced_data_size); } } @@ -147,7 +152,8 @@ TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { BlockCacheTracer writer; // The record should be written to the trace_file since StartTrace is not // called. 
- ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -170,7 +176,8 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -197,11 +204,13 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); writer.EndTrace(); // Write the record again. This time the record should not be written since // EndTrace is called. - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { From d1ae67bdb921e32b7d5c2ad614a1b69faab64c9c Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 17 Jun 2019 10:15:58 -0700 Subject: [PATCH 148/572] Switch Travis to Xenial build (#4789) Summary: I think this should now also run on Travis's new virtualised infrastructure which affords more memory and CPU. We also need to think about migrating from travis-ci.org to travis-ci.com. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4789 Differential Revision: D15856272 fbshipit-source-id: 10b41d21924e8a362bc9646a63ccd1a5dfc437c6 --- .travis.yml | 13 +- CMakeLists.txt | 1 + java/CMakeLists.txt | 301 +++++++++++++++++++++++++------------------- java/Makefile | 15 +++ 4 files changed, 193 insertions(+), 137 deletions(-) diff --git a/.travis.yml b/.travis.yml index e759a642a0c..75eaac8eab5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,4 @@ -sudo: false -dist: trusty +dist: xenial language: cpp os: - linux @@ -9,7 +8,7 @@ compiler: - gcc osx_image: xcode8.3 jdk: - - oraclejdk7 + - openjdk7 cache: - ccache - apt @@ -71,7 +70,10 @@ install: CC=gcc-8 && CXX=g++-8; fi - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + mkdir cmake-dist && curl -sfSL https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + fi + - if [[ "${JOB_NAME}" == java_test ]]; then + java -version && echo "JAVA_HOME=${JAVA_HOME}"; fi before_script: @@ -101,7 +103,7 @@ script: esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest + OPT=-DTRAVIS V=1 make rocksdbjava jtest ;; lite_build) OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools @@ -110,6 +112,7 @@ script: OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) + sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) diff --git a/CMakeLists.txt b/CMakeLists.txt index 006f6798666..eda1281e149 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + add_definitions(-D_POSIX_C_SOURCE=1) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 360951834a7..f00b6f7f919 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.4) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4") + message("Please consider switching to CMake 3.11.4 or newer") +endif() + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,9 +15,9 @@ set(JNI_NATIVE_SOURCES rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc rocksjni/compaction_filter_factory_jnicallback.cc - rocksjni/compaction_job_info.cc - rocksjni/compaction_job_stats.cc - rocksjni/compaction_options.cc + rocksjni/compaction_job_info.cc + rocksjni/compaction_job_stats.cc + rocksjni/compaction_options.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc rocksjni/compact_range_options.cc @@ -72,125 +76,9 @@ set(JNI_NATIVE_SOURCES rocksjni/write_buffer_manager.cc ) -set(NATIVE_JAVA_CLASSES - org.rocksdb.AbstractCompactionFilter - org.rocksdb.AbstractCompactionFilterFactory - org.rocksdb.AbstractComparator - org.rocksdb.AbstractImmutableNativeReference - org.rocksdb.AbstractNativeReference - org.rocksdb.AbstractRocksIterator - org.rocksdb.AbstractSlice - org.rocksdb.AbstractTableFilter - org.rocksdb.AbstractTraceWriter - org.rocksdb.AbstractTransactionNotifier - org.rocksdb.AbstractWalFilter - org.rocksdb.BackupableDBOptions - org.rocksdb.BackupEngine - org.rocksdb.BlockBasedTableConfig - org.rocksdb.BloomFilter - org.rocksdb.CassandraCompactionFilter - org.rocksdb.CassandraValueMergeOperator - org.rocksdb.Checkpoint - org.rocksdb.ClockCache - org.rocksdb.ColumnFamilyHandle - org.rocksdb.ColumnFamilyOptions - org.rocksdb.CompactionJobInfo - org.rocksdb.CompactionJobStats - org.rocksdb.CompactionOptions - org.rocksdb.CompactionOptionsFIFO - org.rocksdb.CompactionOptionsUniversal - org.rocksdb.CompactRangeOptions - org.rocksdb.Comparator - org.rocksdb.ComparatorOptions - org.rocksdb.CompressionOptions - org.rocksdb.DBOptions - org.rocksdb.DirectComparator - org.rocksdb.DirectSlice - org.rocksdb.Env - org.rocksdb.EnvOptions - org.rocksdb.Filter - org.rocksdb.FlushOptions - org.rocksdb.HashLinkedListMemTableConfig - org.rocksdb.HashSkipListMemTableConfig - org.rocksdb.HdfsEnv - org.rocksdb.IngestExternalFileOptions - org.rocksdb.Logger - org.rocksdb.LRUCache - org.rocksdb.MemoryUtil - org.rocksdb.MemTableConfig - org.rocksdb.NativeComparatorWrapper - org.rocksdb.NativeLibraryLoader - org.rocksdb.OptimisticTransactionDB - org.rocksdb.OptimisticTransactionOptions - org.rocksdb.Options - org.rocksdb.OptionsUtil - org.rocksdb.PersistentCache - org.rocksdb.PlainTableConfig - org.rocksdb.RateLimiter - org.rocksdb.ReadOptions - org.rocksdb.RemoveEmptyValueCompactionFilter - org.rocksdb.RestoreOptions - org.rocksdb.RocksCallbackObject 
- org.rocksdb.RocksDB - org.rocksdb.RocksEnv - org.rocksdb.RocksIterator - org.rocksdb.RocksIteratorInterface - org.rocksdb.RocksMemEnv - org.rocksdb.RocksMutableObject - org.rocksdb.RocksObject - org.rocksdb.SkipListMemTableConfig - org.rocksdb.Slice - org.rocksdb.Snapshot - org.rocksdb.SstFileManager - org.rocksdb.SstFileWriter - org.rocksdb.Statistics - org.rocksdb.StringAppendOperator - org.rocksdb.TableFormatConfig - org.rocksdb.ThreadStatus - org.rocksdb.TimedEnv - org.rocksdb.Transaction - org.rocksdb.TransactionDB - org.rocksdb.TransactionDBOptions - org.rocksdb.TransactionLogIterator - org.rocksdb.TransactionOptions - org.rocksdb.TtlDB - org.rocksdb.UInt64AddOperator - org.rocksdb.VectorMemTableConfig - org.rocksdb.WBWIRocksIterator - org.rocksdb.WriteBatch - org.rocksdb.WriteBatch.Handler - org.rocksdb.WriteBatchInterface - org.rocksdb.WriteBatchWithIndex - org.rocksdb.WriteOptions - org.rocksdb.NativeComparatorWrapperTest - org.rocksdb.RocksDBExceptionTest - org.rocksdb.SnapshotTest - org.rocksdb.WriteBatchTest - org.rocksdb.WriteBatchTestInternalHelper - org.rocksdb.WriteBufferManager -) - -include(FindJava) -include(UseJava) -include(FindJNI) - -include_directories(${JNI_INCLUDE_DIRS}) -include_directories(${PROJECT_SOURCE_DIR}/java) - -set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) -set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) -set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) -set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) -set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) -set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) -set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) -set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) - -add_jar( - rocksdbjni_classes - SOURCES - src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java + src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -338,8 +226,8 @@ add_jar( src/main/java/org/rocksdb/WalProcessingOption.java src/main/java/org/rocksdb/WALRecoveryMode.java src/main/java/org/rocksdb/WBWIRocksIterator.java - src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatch.java + src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java @@ -348,6 +236,10 @@ add_jar( src/main/java/org/rocksdb/util/Environment.java src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java src/main/java/org/rocksdb/util/SizeUnit.java + src/main/java/org/rocksdb/UInt64AddOperator.java +) + +set(JAVA_TEST_CLASSES src/test/java/org/rocksdb/BackupEngineTest.java src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -355,13 +247,59 @@ add_jar( src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java - src/main/java/org/rocksdb/UInt64AddOperator.java src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java - INCLUDE_JARS ${JAVA_TESTCLASSPATH} ) 
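Both of the header-generation paths set up below produce the same artifact: C headers under java/include/ that declare the JNI entry points implemented by the rocksjni/*.cc sources listed at the top of this file. As a minimal sketch of the shape of one such declaration — the class, method, and mangled name are illustrative assumptions, not copied from a generated header:

    #include <jni.h>

    #ifdef __cplusplus
    extern "C" {
    #endif
    /*
     * Class:     org_rocksdb_Options
     * Method:    newOptions
     * Signature: ()J
     *
     * Hypothetical example of a generated declaration: javah (Java 7) and
     * `javac -h` / GENERATE_NATIVE_HEADERS (Java 8+) both emit this form.
     */
    JNIEXPORT jlong JNICALL Java_org_rocksdb_Options_newOptions__(JNIEnv*, jclass);
    #ifdef __cplusplus
    }
    #endif

Only the generation mechanism changes with the JDK (javah was removed in Java 10, hence the version checks below); the C++ implementations are unaffected.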
+include(FindJava) +include(UseJava) +find_package(JNI) + +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) + +set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) + +if(${Java_VERSION_MAJOR} VERSION_GREATER_EQUAL "10" AND ${CMAKE_VERSION} VERSION_LESS "3.11.4") + # Java 10 and newer don't have javah, but the alternative GENERATE_NATIVE_HEADERS requires CMake 3.11.4 or newer + message(FATAL_ERROR "Detected Java 10 or newer (${Java_VERSION_STRING}), to build with CMake please upgrade CMake to 3.11.4 or newer") + +elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 prepare the JAR... + message("Preparing Jar for Java 7") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + ) + +else () + # Java 1.8 or newer prepare the JAR... + message("Preparing Jar for JDK ${Java_VERSION_STRING}") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} + ) + +endif() + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) endif() @@ -424,15 +362,114 @@ if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() -set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 ONLY generate JNI headers, Java 1.8+ JNI is handled in add_jar step above + message("Preparing JNI headers for Java 7") + set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractCompactionFilterFactory + org.rocksdb.AbstractComparator + org.rocksdb.AbstractImmutableNativeReference + org.rocksdb.AbstractNativeReference + org.rocksdb.AbstractRocksIterator + org.rocksdb.AbstractSlice + org.rocksdb.AbstractTableFilter + org.rocksdb.AbstractTraceWriter + org.rocksdb.AbstractTransactionNotifier + org.rocksdb.AbstractWalFilter + org.rocksdb.BackupableDBOptions + org.rocksdb.BackupEngine + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.CassandraCompactionFilter + org.rocksdb.CassandraValueMergeOperator + org.rocksdb.Checkpoint + org.rocksdb.ClockCache + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.CompactionJobInfo + org.rocksdb.CompactionJobStats + org.rocksdb.CompactionOptions + org.rocksdb.CompactionOptionsFIFO + org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.CompressionOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.EnvOptions + 
org.rocksdb.Filter + org.rocksdb.FlushOptions + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HdfsEnv + org.rocksdb.IngestExternalFileOptions + org.rocksdb.Logger + org.rocksdb.LRUCache + org.rocksdb.MemoryUtil + org.rocksdb.MemTableConfig + org.rocksdb.NativeComparatorWrapper + org.rocksdb.NativeLibraryLoader + org.rocksdb.OptimisticTransactionDB + org.rocksdb.OptimisticTransactionOptions + org.rocksdb.Options + org.rocksdb.OptionsUtil + org.rocksdb.PersistentCache + org.rocksdb.PlainTableConfig + org.rocksdb.RateLimiter + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreOptions + org.rocksdb.RocksCallbackObject + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + org.rocksdb.RocksIteratorInterface + org.rocksdb.RocksMemEnv + org.rocksdb.RocksMutableObject + org.rocksdb.RocksObject + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Snapshot + org.rocksdb.SstFileManager + org.rocksdb.SstFileWriter + org.rocksdb.Statistics + org.rocksdb.StringAppendOperator + org.rocksdb.TableFormatConfig + org.rocksdb.ThreadStatus + org.rocksdb.TimedEnv + org.rocksdb.Transaction + org.rocksdb.TransactionDB + org.rocksdb.TransactionDBOptions + org.rocksdb.TransactionLogIterator + org.rocksdb.TransactionOptions + org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator + org.rocksdb.VectorMemTableConfig + org.rocksdb.WBWIRocksIterator + org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteBatchInterface + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WriteOptions + org.rocksdb.NativeComparatorWrapperTest + org.rocksdb.RocksDBExceptionTest + org.rocksdb.SnapshotTest + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager + ) -file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) -create_javah( - TARGET rocksdbjni_headers - CLASSES ${NATIVE_JAVA_CLASSES} - CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} - OUTPUT_DIR ${JNI_OUTPUT_DIR} -) + create_javah( + TARGET rocksdbjni_headers + CLASSES ${NATIVE_JAVA_CLASSES} + CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + OUTPUT_DIR ${JNI_OUTPUT_DIR} + ) +endif() if(NOT MSVC) set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/java/Makefile b/java/Makefile index efc9d2b4e11..7aa15bfd038 100644 --- a/java/Makefile +++ b/java/Makefile @@ -229,12 +229,20 @@ javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java +else + $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java +endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) +endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) @@ -276,11 +284,18 @@ resolve_test_deps: java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) +ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ 
$(TEST_SRC)/org/rocksdb/*.java $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) +else + $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java +endif test: java java_test run_test From d43b4cd570dccf234d2a43f6acec2d5160971cc3 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 11:03:47 -0700 Subject: [PATCH 149/572] Integrate block cache tracing into db_bench (#5459) Summary: This PR integrates the block cache tracing into db_bench. It adds three command line arguments. -block_cache_trace_file (Block cache trace file path.) type: string default: "" -block_cache_trace_max_trace_file_size_in_bytes (The maximum block cache trace file size in bytes. Block cache accesses will not be logged if the trace file size exceeds this threshold. Default is 64 GB.) type: int64 default: 68719476736 -block_cache_trace_sampling_frequency (Block cache trace sampling frequency, termed s. It uses spatial downsampling and samples accesses to one out of s blocks.) type: int32 default: 1 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5459 Differential Revision: D15832031 Pulled By: HaoyuHuang fbshipit-source-id: 0ecf2f2686557251fe741a2769b21170777efa3d --- tools/db_bench_tool.cc | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b254978c5ed..a14758418c3 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -774,6 +774,17 @@ DEFINE_string(trace_file, "", "Trace workload to a file. "); DEFINE_int32(trace_replay_fast_forward, 1, "Fast forward trace replay, must >= 1. "); +DEFINE_int32(block_cache_trace_sampling_frequency, 1, + "Block cache trace sampling frequency, termed s. It uses spatial " + "downsampling and samples accesses to one out of s blocks."); +DEFINE_int64( + block_cache_trace_max_trace_file_size_in_bytes, + uint64_t{64} * 1024 * 1024 * 1024, + "The maximum block cache trace file size in bytes. Block cache accesses " + "will not be logged if the trace file size exceeds this threshold. Default " + "is 64 GB."); +DEFINE_string(block_cache_trace_file, "", "Block cache trace file path."); + static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -2081,6 +2092,7 @@ class Benchmark { Options open_options_; // keep options around to properly destroy db later #ifndef ROCKSDB_LITE TraceOptions trace_options_; + TraceOptions block_cache_trace_options_; #endif int64_t reads_; int64_t deletes_; @@ -2917,6 +2929,47 @@ class Benchmark { fprintf(stdout, "Tracing the workload to: [%s]\n", FLAGS_trace_file.c_str()); } + // Start block cache tracing. + if (!FLAGS_block_cache_trace_file.empty()) { + // Sanity checks. 
+      if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
+        fprintf(stderr,
+                "Block cache trace sampling frequency must be higher than "
+                "0.\n");
+        exit(1);
+      }
+      if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
+        fprintf(stderr,
+                "The maximum file size for block cache tracing must be "
+                "higher than 0.\n");
+        exit(1);
+      }
+      block_cache_trace_options_.max_trace_file_size =
+          FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
+      block_cache_trace_options_.sampling_frequency =
+          FLAGS_block_cache_trace_sampling_frequency;
+      std::unique_ptr<TraceWriter> block_cache_trace_writer;
+      Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                    FLAGS_block_cache_trace_file,
+                                    &block_cache_trace_writer);
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Encountered an error when creating trace writer, %s\n",
+                s.ToString().c_str());
+        exit(1);
+      }
+      s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
+                                       std::move(block_cache_trace_writer));
+      if (!s.ok()) {
+        fprintf(
+            stderr,
+            "Encountered an error when starting block cache tracing, %s\n",
+            s.ToString().c_str());
+        exit(1);
+      }
+      fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
+              FLAGS_block_cache_trace_file.c_str());
+    }
 #endif  // ROCKSDB_LITE

     if (num_warmup > 0) {
@@ -2959,6 +3012,14 @@ class Benchmark {
               s.ToString().c_str());
       }
     }
+    if (!FLAGS_block_cache_trace_file.empty()) {
+      Status s = db_.db->EndBlockCacheTrace();
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Encountered an error ending the block cache tracing, %s\n",
+                s.ToString().c_str());
+      }
+    }
 #endif  // ROCKSDB_LITE

     if (FLAGS_statistics) {

From ee294c24ed26a7efb6688ed165328b7da68aee0d Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 17 Jun 2019 11:07:27 -0700
Subject: [PATCH 150/572] Make db_bloom_filter_test parallel (#5467)

Summary:
When run under TSAN it sometimes takes over 10 minutes and times out. The
slowest ones are `DBBloomFilterTestWithParam.BloomFilter`, of which we have
6. Making the tests run in parallel should take care of the timeout issue.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5467

Differential Revision: D15856912

Pulled By: maysamyabandeh

fbshipit-source-id: 26c43c55312974c1b809c070342dee037d0219f4
---
 TARGETS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TARGETS b/TARGETS
index 7a8bb000596..45a99a55d77 100644
--- a/TARGETS
+++ b/TARGETS
@@ -539,7 +539,7 @@ ROCKS_TESTS = [
     [
         "db_bloom_filter_test",
         "db/db_bloom_filter_test.cc",
-        "serial",
+        "parallel",
     ],
     [
         "db_compaction_filter_test",

From 671d15cbdd3839acb54cb21a2aa82efca4917155 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Mon, 17 Jun 2019 15:17:43 -0700
Subject: [PATCH 151/572] Persistent Stats: persist stats history to disk
 (#5046)

Summary:
This PR continues the work in https://github.com/facebook/rocksdb/pull/4748 and https://github.com/facebook/rocksdb/pull/4535 by adding a new DBOption `persist_stats_to_disk` which instructs RocksDB to persist stats history to RocksDB itself. When statistics is enabled, and both options `stats_persist_period_sec` and `persist_stats_to_disk` are set, RocksDB will periodically write stats to a built-in column family in the following form: key -> (timestamp in microseconds)#(stats name), value -> stats value. The existing API `GetStatsHistory` will detect the current value of `persist_stats_to_disk` and either read from the in-memory data structure or from the hidden column family on disk.
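For a concrete picture of the read path, here is a minimal sketch of consuming the history through the public API, assuming a db opened with statistics enabled, `stats_persist_period_sec` set, and `persist_stats_to_disk = true` (error handling elided):

    #include <cinttypes>
    #include <cstdio>
    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/stats_history.h"

    // Print every persisted stats slice whose timestamp falls in
    // [start_time, end_time).
    void DumpStatsHistory(rocksdb::DB* db, uint64_t start_time,
                          uint64_t end_time) {
      std::unique_ptr<rocksdb::StatsHistoryIterator> it;
      rocksdb::Status s = db->GetStatsHistory(start_time, end_time, &it);
      if (!s.ok() || it == nullptr) {
        return;
      }
      for (; it->Valid(); it->Next()) {
        uint64_t slice_time = it->GetStatsTime();
        for (const auto& stat : it->GetStatsMap()) {
          // Each value is the per-interval delta of one ticker.
          std::fprintf(stdout, "%" PRIu64 " %s = %" PRIu64 "\n", slice_time,
                       stat.first.c_str(), stat.second);
        }
      }
    }

Whether the iterator is backed by the in-memory map or by the hidden column family is decided inside `GetStatsHistory`, so callers do not change when `persist_stats_to_disk` is toggled.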
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5046 Differential Revision: D15863138 Pulled By: miasantreble fbshipit-source-id: bb82abdb3f2ca581aa42531734ac799f113e931b --- CMakeLists.txt | 4 +- Makefile | 4 + TARGETS | 8 +- db/db_impl/db_impl.cc | 91 ++- db/db_impl/db_impl.h | 26 +- db/db_impl/db_impl_debug.cc | 4 +- db/db_impl/db_impl_open.cc | 117 +++- db/db_options_test.cc | 265 -------- db/version_set.cc | 19 +- include/rocksdb/db.h | 3 +- include/rocksdb/options.h | 12 + include/rocksdb/stats_history.h | 4 +- {db => monitoring}/in_memory_stats_history.cc | 2 +- {db => monitoring}/in_memory_stats_history.h | 2 +- monitoring/persistent_stats_history.cc | 171 ++++++ monitoring/persistent_stats_history.h | 83 +++ monitoring/stats_history_test.cc | 576 ++++++++++++++++++ options/db_options.cc | 5 +- options/db_options.h | 1 + options/options.cc | 1 - options/options_helper.cc | 5 + options/options_settable_test.cc | 1 + options/options_test.cc | 2 + src.mk | 66 +- tools/db_bench_tool.cc | 3 + 25 files changed, 1143 insertions(+), 332 deletions(-) rename {db => monitoring}/in_memory_stats_history.cc (97%) rename {db => monitoring}/in_memory_stats_history.h (98%) create mode 100644 monitoring/persistent_stats_history.cc create mode 100644 monitoring/persistent_stats_history.h create mode 100644 monitoring/stats_history_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index eda1281e149..7ff61dca99f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -521,7 +521,6 @@ set(SOURCES db/flush_scheduler.cc db/forward_iterator.cc db/internal_stats.cc - db/in_memory_stats_history.cc db/logs_with_prep_tracker.cc db/log_reader.cc db/log_writer.cc @@ -568,10 +567,12 @@ set(SOURCES memtable/write_buffer_manager.cc monitoring/histogram.cc monitoring/histogram_windowing.cc + monitoring/in_memory_stats_history.cc monitoring/instrumented_mutex.cc monitoring/iostats_context.cc monitoring/perf_context.cc monitoring/perf_level.cc + monitoring/persistent_stats_history.cc monitoring/statistics.cc monitoring/thread_status_impl.cc monitoring/thread_status_updater.cc @@ -955,6 +956,7 @@ if(WITH_TESTS) monitoring/histogram_test.cc monitoring/iostats_context_test.cc monitoring/statistics_test.cc + monitoring/stats_history_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc diff --git a/Makefile b/Makefile index 5944325aafe..a499cbbedd7 100644 --- a/Makefile +++ b/Makefile @@ -548,6 +548,7 @@ TESTS = \ ldb_cmd_test \ persistent_cache_test \ statistics_test \ + stats_history_test \ lru_cache_test \ object_registry_test \ repair_test \ @@ -1566,6 +1567,9 @@ persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +stats_history_test: monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 45a99a55d77..a43ed6b1085 100644 --- a/TARGETS +++ b/TARGETS @@ -113,7 +113,6 @@ cpp_library( "db/flush_job.cc", "db/flush_scheduler.cc", "db/forward_iterator.cc", - "db/in_memory_stats_history.cc", "db/internal_stats.cc", "db/log_reader.cc", "db/log_writer.cc", @@ -163,10 +162,12 @@ cpp_library( "memtable/write_buffer_manager.cc", "monitoring/histogram.cc", "monitoring/histogram_windowing.cc", + "monitoring/in_memory_stats_history.cc", "monitoring/instrumented_mutex.cc", 
"monitoring/iostats_context.cc", "monitoring/perf_context.cc", "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", "monitoring/statistics.cc", "monitoring/thread_status_impl.cc", "monitoring/thread_status_updater.cc", @@ -971,6 +972,11 @@ ROCKS_TESTS = [ "monitoring/statistics_test.cc", "serial", ], + [ + "stats_history_test", + "monitoring/stats_history_test.cc", + "serial", + ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 154e6dd2339..21b8f3d9165 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -34,7 +34,6 @@ #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" -#include "db/in_memory_stats_history.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -58,8 +57,10 @@ #include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "options/cf_options.h" @@ -98,6 +99,9 @@ namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +const int kMicrosInSecond = 1000 * 1000; void DumpRocksDBBuildVersion(Logger* log); CompressionType GetCompressionFlush( @@ -162,6 +166,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, logfile_number_(0), log_dir_synced_(false), log_empty_(true), + persist_stats_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), is_snapshot_supported_(true), @@ -482,10 +487,17 @@ Status DBImpl::CloseHelper() { } } - if (default_cf_handle_ != nullptr) { + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); - delete default_cf_handle_; + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } mutex_.Lock(); } @@ -634,7 +646,7 @@ void DBImpl::StartTimedTasks() { if (!thread_dump_stats_) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - stats_dump_period_sec * 1000000)); + static_cast(stats_dump_period_sec) * kMicrosInSecond)); } } stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; @@ -642,14 +654,14 @@ void DBImpl::StartTimedTasks() { if (!thread_persist_stats_) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - stats_persist_period_sec * 1000000)); + static_cast(stats_persist_period_sec) * kMicrosInSecond)); } } } } // esitmate the total size of stats_history_ -size_t DBImpl::EstiamteStatsHistorySize() const { +size_t DBImpl::EstimateInMemoryStatsHistorySize() const { size_t size_total = sizeof(std::map>); if (stats_history_.size() == 0) return size_total; @@ -671,7 +683,7 @@ void DBImpl::PersistStats() { if (shutdown_initiated_) { return; } - uint64_t now_micros = env_->NowMicros(); + uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; Statistics* statistics = 
immutable_db_options_.statistics.get(); if (!statistics) { return; @@ -682,12 +694,40 @@ void DBImpl::PersistStats() { stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; } - // TODO(Zhongyi): also persist immutable_db_options_.statistics - { - std::map stats_map; - if (!statistics->getTickerMap(&stats_map)) { - return; + std::map stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + + if (immutable_db_options_.persist_stats_to_disk) { + WriteBatch batch; + if (stats_slice_initialized_) { + for (const auto& stat : stats_map) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s\n", + s.ToString().c_str()); + } + // TODO(Zhongyi): add purging for persisted data + } else { InstrumentedMutexLock l(&stats_history_mutex_); // calculate the delta from last time if (stats_slice_initialized_) { @@ -697,17 +737,19 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } - stats_history_[now_micros] = stats_delta; + stats_history_[now_seconds] = stats_delta; } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); // delete older stats snapshots to control memory consumption - bool purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + bool purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; while (purge_needed && !stats_history_.empty()) { stats_history_.erase(stats_history_.begin()); - purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; } } // TODO: persist stats to disk @@ -741,8 +783,13 @@ Status DBImpl::GetStatsHistory( if (!stats_iterator) { return Status::InvalidArgument("stats_iterator not preallocated."); } - stats_iterator->reset( - new InMemoryStatsHistoryIterator(start_time, end_time, this)); + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } return (*stats_iterator)->status(); } @@ -946,7 +993,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_dump_period_sec > 0) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - new_options.stats_dump_period_sec * 1000000)); + static_cast(new_options.stats_dump_period_sec) * + kMicrosInSecond)); } else { thread_dump_stats_.reset(); } @@ -961,7 +1009,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_persist_period_sec > 0) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - new_options.stats_persist_period_sec * 1000000)); + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); } else { 
thread_persist_stats_.reset(); } @@ -1373,6 +1422,10 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 942c36ff6e6..e6d5a56e244 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -66,6 +66,7 @@ class Arena; class ArenaWrappedDBIter; class InMemoryStatsHistoryIterator; class MemTable; +class PersistentStatsHistoryIterator; class TableCache; class TaskLimiterToken; class Version; @@ -268,6 +269,8 @@ class DBImpl : public DB { ColumnFamilyHandle* DefaultColumnFamily() const override; + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + virtual Status Close() override; Status GetStatsHistory( @@ -822,7 +825,7 @@ class DBImpl : public DB { void TEST_WaitForDumpStatsRun(std::function callback) const; void TEST_WaitForPersistStatsRun(std::function callback) const; bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; #endif // NDEBUG @@ -1016,6 +1019,7 @@ class DBImpl : public DB { friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; @@ -1176,6 +1180,21 @@ class DBImpl : public DB { PrepickedCompaction* prepicked_compaction; }; + // Initialize the built-in column family for persistent stats. Depending on + // whether on-disk persistent stats have been enabled before, it may either + // create both a new column family and a column family handle, or just a + // column family handle. + // Required: DB mutex held + Status InitPersistStatsColumnFamily(); + + // The persistent stats column family has two format version keys which are + // used for compatibility checks. Write the format version if the CF is + // created for the first time; read the format version and check + // compatibility if recovering from disk. This function requires DB mutex + // held at entrance but may release and re-acquire DB mutex in the process. + // Required: DB mutex held + Status PersistentStatsProcessFormatVersion(); + Status ResumeImpl(); void MaybeIgnoreError(Status* s) const; @@ -1424,7 +1443,7 @@ class DBImpl : public DB { void PrintStatistics(); - size_t EstiamteStatsHistorySize() const; + size_t EstimateInMemoryStatsHistorySize() const; // persist stats to column family "___rocksdb_stats_history___" void PersistStats(); @@ -1571,6 +1590,9 @@ class DBImpl : public DB { // expensive mutex_ lock during WAL write, which updates log_empty_. bool log_empty_; + ColumnFamilyHandleImpl* persist_stats_cf_handle_; + + bool persistent_stats_cfd_exists_ = true; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_.
However since back() is never popped, and push_back() diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 4b558facb37..ec1e1b47752 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -262,8 +262,8 @@ bool DBImpl::TEST_IsPersistentStatsEnabled() const { return thread_persist_stats_ && thread_persist_stats_->IsRunning(); } -size_t DBImpl::TEST_EstiamteStatsHistorySize() const { - return EstiamteStatsHistorySize(); +size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + return EstimateInMemoryStatsHistorySize(); } } // namespace rocksdb #endif // NDEBUG diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index baa4fe707aa..eec7cf16aa7 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -13,6 +13,7 @@ #include "db/builder.h" #include "db/error_handler.h" #include "file/sst_file_manager_impl.h" +#include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based/block_based_table_factory.h" @@ -375,6 +376,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); + if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } @@ -386,6 +388,10 @@ Status DBImpl::Recover( } } } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } // Initial max_total_in_memory_state_ before recovery logs. Log recovery // may check this value to decide whether to flush. @@ -401,6 +407,8 @@ Status DBImpl::Recover( default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; @@ -496,6 +504,98 @@ Status DBImpl::Recover( return s; } +Status DBImpl::PersistentStatsProcessFormatVersion() { + mutex_.AssertHeld(); + Status s; + // persist version when stats CF doesn't exist + bool should_persist_format_version = !persistent_stats_cfd_exists_; + mutex_.Unlock(); + if (persistent_stats_cfd_exists_) { + // Check persistent stats format version compatibility. Drop and recreate + // persistent stats CF if format version is incompatible + uint64_t format_version_recovered = 0; + Status s_format = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); + uint64_t compatible_version_recovered = 0; + Status s_compatible = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kCompatibleVersion, + &compatible_version_recovered); + // abort reading from existing stats CF if any of following is true: + // 1. failed to read format version or compatible version from disk + // 2. sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. 
Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. 
+ persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + // REQUIRES: log_numbers are sorted in ascending order Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) { @@ -1065,12 +1165,23 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { std::vector column_families; column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } std::vector handles; Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { - assert(handles.size() == 1); + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } // i can delete the handle since DBImpl is always holding a reference to // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } delete handles[0]; } return s; @@ -1247,6 +1358,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->directories_.GetDbDir()->Fsync(); } } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index bf33153284e..7dd672646b5 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -518,114 +518,6 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { Close(); } -TEST_F(DBOptionsTest, RunStatsDumpPeriodSec) { - Options options; - options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - int counter = 0; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { - counter++; - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); - int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); - } - ASSERT_EQ(counter, old_val); - Close(); -} - -// Test persistent stats background thread scheduling 
and cancelling -TEST_F(DBOptionsTest, StatsPersistScheduling) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); - Close(); -} - -// Test enabling persistent stats for the first time -TEST_F(DBOptionsTest, PersistentStatsFreshInstall) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - Close(); -} - TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { Options options; options.create_if_missing = true; @@ -640,163 +532,6 @@ TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); } -TEST_F(DBOptionsTest, GetStatsHistory) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = rocksdb::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - 
"InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - int mock_time = 1; - // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 6 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - // disabled stats snapshots - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - size_t stats_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - ASSERT_GT(stats_count, 0); - // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_new = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - stats_count_new += stats_iter->GetStatsMap().size(); - } - ASSERT_EQ(stats_count_new, stats_count); - Close(); -} - -TEST_F(DBOptionsTest, InMemoryStatsHistoryPurging) { - Options options; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - // some random operation to populate statistics - ASSERT_OK(Delete("foo")); - ASSERT_OK(Put("sol", "sol")); - ASSERT_OK(Put("epic", "epic")); - ASSERT_OK(Put("ltd", "ltd")); - ASSERT_EQ("sol", Get("sol")); - ASSERT_EQ("epic", Get("epic")); - ASSERT_EQ("ltd", Get("ltd")); - Iterator* iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - - // second round of ops - ASSERT_OK(Put("saigon", "saigon")); - ASSERT_OK(Put("noodle talk", "noodle talk")); - ASSERT_OK(Put("ping bistro", "ping bistro")); - iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); 
iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 10 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count = 0; - int slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - size_t stats_history_size = dbfull()->TEST_EstiamteStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); - // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_reopen = 0; - slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count_reopen += stats_map.size(); - } - size_t stats_history_size_reopen = dbfull()->TEST_EstiamteStatsHistorySize(); - // only one slice can fit under the new stats_history_buffer_size - ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && - stats_history_size_reopen > 0); - ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); - Close(); -} - static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { dbfull->TEST_LockMutex(); JobContext job_context(0); diff --git a/db/version_set.cc b/db/version_set.cc index 30fc744c98a..ccedca7940d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,10 +9,10 @@ #include "db/version_set.h" -#include #include #include #include +#include #include #include #include @@ -32,6 +32,7 @@ #include "file/filename.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -3962,11 +3963,23 @@ Status VersionSet::ApplyOneVersionEditToBuilder( edit.column_family_name_); } auto cf_options = name_to_options.find(edit.column_family_name_); - if (cf_options == name_to_options.end()) { + // implicitly add persistent_stats column family without requiring user + // to specify + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options.end() && + !is_persistent_stats_column_family) { column_families_not_found.insert( {edit.column_family_, edit.column_family_name_}); } else { - cfd = CreateColumnFamily(cf_options->second, &edit); + // recover persistent_stats CF from a DB that already contains it + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + cfd = CreateColumnFamily(cfo, &edit); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + } cfd->set_initialized(); builders.insert(std::make_pair( edit.column_family_, 
std::unique_ptr( diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 3a32d6f82bd..0f8573e4319 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -59,6 +59,7 @@ class CompactionJobInfo; #endif extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; struct ColumnFamilyDescriptor { std::string name; ColumnFamilyOptions options; @@ -1335,7 +1336,7 @@ class DB { // Given a window [start_time, end_time), setup a StatsHistoryIterator // to access stats history. Note the start_time and end_time are epoch - // time measured in microsecond, and end_time is an exclusive bound. + // time measured in seconds, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 307582fe678..fe5617fb5c3 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -694,6 +694,18 @@ struct DBOptions { // Default: 600 unsigned int stats_persist_period_sec = 600; + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. Users can query through the `GetStatsHistory` API. + // If a user attempts to create a column family with the same name on a DB + // that has previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When persisting stats to disk, the stat name will be limited to 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + // if not zero, periodically take stats snapshots and store in memory, the // memory size for stats snapshots is capped at stats_history_buffer_size // Default: 1MB diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 1a841908170..c6634ae68aa 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -49,10 +49,12 @@ class StatsHistoryIterator { // REQUIRES: Valid() virtual void Next() = 0; - // Return the time stamp (in microseconds) when stats history is recorded. + // Return the time stamp (in seconds) when stats history is recorded. // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + virtual int GetFormatVersion() const { return -1; } + // Return the current stats history as an std::map which specifies the // mapping from stats name to stats value. The underlying storage // for the returned map is valid only until the next modification of diff --git a/db/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc similarity index 97% rename from db/in_memory_stats_history.cc rename to monitoring/in_memory_stats_history.cc index 41fdb71c8c1..22ecde0ab6c 100644 --- a/db/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -6,7 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors.
-#include "db/in_memory_stats_history.h" +#include "monitoring/in_memory_stats_history.h" #include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/in_memory_stats_history.h b/monitoring/in_memory_stats_history.h similarity index 98% rename from db/in_memory_stats_history.h rename to monitoring/in_memory_stats_history.h index eeb679cc0a2..8ccec146a96 100644 --- a/db/in_memory_stats_history.h +++ b/monitoring/in_memory_stats_history.h @@ -25,7 +25,7 @@ namespace rocksdb { class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: // Setup InMemoryStatsHistoryIterator to return stats snapshots between - // microsecond timestamps [start_time, end_time) + // seconds timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc new file mode 100644 index 00000000000..c1704f56747 --- /dev/null +++ b/monitoring/persistent_stats_history.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/persistent_stats_history.h" + +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "port/likely.h" +#include "util/string_util.h" + +namespace rocksdb { +// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286] +const int kNowSecondsStringLength = 10; +const std::string kFormatVersionKeyString = + "__persistent_stats_format_version__"; +const std::string kCompatibleVersionKeyString = + "__persistent_stats_compatible_version__"; +// Every release maintains two versions numbers for persistents stats: Current +// format version and compatible format version. Current format version +// designates what type of encoding will be used when writing to stats CF; +// compatible format version designates the minimum format version that +// can decode the stats CF encoded using the current format version. 
+const uint64_t kStatsCFCurrentFormatVersion = 1; +const uint64_t kStatsCFCompatibleFormatVersion = 1; + +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number) { + if (type >= StatsVersionKeyType::kKeyTypeMax) { + return Status::InvalidArgument("Invalid stats version key type provided"); + } + std::string key; + if (type == StatsVersionKeyType::kFormatVersion) { + key = kFormatVersionKeyString; + } else if (type == StatsVersionKeyType::kCompatibleVersion) { + key = kCompatibleVersionKeyString; + } + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result); + if (!s.ok() || result.empty()) { + return Status::NotFound("Persistent stats version key " + key + + " not found."); + } + + // read version_number but do nothing in current version + *version_number = ParseUint64(result); + return Status::OK(); +} + +int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key, + int size, char* buf) { + char timestamp[kNowSecondsStringLength + 1]; + // make time stamp string equal in length to allow sorting by time + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(now_seconds)); + timestamp[kNowSecondsStringLength] = '\0'; + return snprintf(buf, size, "%s#%s", timestamp, key.c_str()); +} + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) { + cfo->write_buffer_size = 2 << 20; + cfo->target_file_size_base = 2 * 1048576; + cfo->max_bytes_for_level_base = 10 * 1048576; + cfo->snap_refresh_nanos = 0; + cfo->soft_pending_compaction_bytes_limit = 256 * 1048576; + cfo->hard_pending_compaction_bytes_limit = 1073741824ul; + cfo->compression = kNoCompression; +} + +PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {} + +bool PersistentStatsHistoryIterator::Valid() const { return valid_; } + +Status PersistentStatsHistoryIterator::status() const { return status_; } + +void PersistentStatsHistoryIterator::Next() { + // increment start_time by 1 to avoid infinite loop + AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); +} + +uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; } + +const std::map& +PersistentStatsHistoryIterator::GetStatsMap() const { + return stats_map_; +} + +std::pair parseKey(const Slice& key, + uint64_t start_time) { + std::pair result; + std::string key_str = key.ToString(); + std::string::size_type pos = key_str.find("#"); + // TODO(Zhongyi): add counters to track parse failures? 
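+ // Example (illustrative): with start_time = 1000, the key + // "0000001234#rocksdb.block.cache.miss" parses to + // {1234, "rocksdb.block.cache.miss"}, while a key that lacks the '#' + // separator, or whose timestamp is below start_time, yields + // {port::kMaxUint64, ""} so the caller can skip it.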
+ if (pos == std::string::npos) { + result.first = port::kMaxUint64; + result.second.clear(); + } else { + uint64_t parsed_time = ParseUint64(key_str.substr(0, pos)); + // skip entries with timestamp smaller than start_time + if (parsed_time < start_time) { + result.first = port::kMaxUint64; + result.second = ""; + } else { + result.first = parsed_time; + std::string key_resize = key_str.substr(pos + 1); + result.second = key_resize; + } + } + return result; +} + +// advance the iterator to the next time between [start_time, end_time) +// if success, update time_ and stats_map_ with new_time and stats_map +void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, + uint64_t end_time) { + // try to find next entry in stats_history_ map + if (db_impl_ != nullptr) { + ReadOptions ro; + Iterator* iter = + db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); + + char timestamp[kNowSecondsStringLength + 1]; + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(std::max(time_, start_time))); + timestamp[kNowSecondsStringLength] = '\0'; + + iter->Seek(timestamp); + // no more entries with timestamp >= start_time is found or version key + // is found to be incompatible + if (!iter->Valid()) { + valid_ = false; + delete iter; + return; + } + time_ = parseKey(iter->key(), start_time).first; + valid_ = true; + // check parsed time and invalid if it exceeds end_time + if (time_ > end_time) { + valid_ = false; + delete iter; + return; + } + // find all entries with timestamp equal to time_ + std::map new_stats_map; + std::pair kv; + for (; iter->Valid(); iter->Next()) { + kv = parseKey(iter->key(), start_time); + if (kv.first != time_) { + break; + } + if (kv.second.compare(kFormatVersionKeyString) == 0) { + continue; + } + new_stats_map[kv.second] = ParseUint64(iter->value().ToString()); + } + stats_map_.swap(new_stats_map); + delete iter; + } else { + valid_ = false; + } +} + +} // namespace rocksdb diff --git a/monitoring/persistent_stats_history.h b/monitoring/persistent_stats_history.h new file mode 100644 index 00000000000..9a6885987fd --- /dev/null +++ b/monitoring/persistent_stats_history.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
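+// For reference, after a couple of persisted slices the stats CF holds rows +// like the following (illustrative sample; a real run stores one row per +// ticker per slice): +// "0000000005#rocksdb.block.cache.miss" -> "0" +// "0000000010#rocksdb.write.wal" -> "2" +// "__persistent_stats_compatible_version__" -> "1" +// "__persistent_stats_format_version__" -> "1"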
+ +#pragma once + +#include "db/db_impl/db_impl.h" +#include "rocksdb/stats_history.h" + +namespace rocksdb { + +extern const std::string kFormatVersionKeyString; +extern const std::string kCompatibleVersionKeyString; +extern const uint64_t kStatsCFCurrentFormatVersion; +extern const uint64_t kStatsCFCompatibleFormatVersion; + +enum StatsVersionKeyType : uint32_t { + kFormatVersion = 1, + kCompatibleVersion = 2, + kKeyTypeMax = 3 +}; + +// Read the version number from the persistent stats cf depending on the type +// provided; stores the version number in `*version_number`. +// Returns Status::OK() on success, or another status code on failure +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number); + +// Encode timestamp and stats key into buf +// Format: timestamp(10 digit) + '#' + key +// Total length of encoded key will be capped at 100 bytes +int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key, + int size, char* buf); + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo); + +class PersistentStatsHistoryIterator final : public StatsHistoryIterator { + public: + PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time, + DBImpl* db_impl) + : time_(0), + start_time_(start_time), + end_time_(end_time), + valid_(true), + db_impl_(db_impl) { + AdvanceIteratorByTime(start_time_, end_time_); + } + ~PersistentStatsHistoryIterator() override; + bool Valid() const override; + Status status() const override; + + void Next() override; + uint64_t GetStatsTime() const override; + + const std::map& GetStatsMap() const override; + + private: + // advance the iterator to the next stats history record with timestamp + // between [start_time, end_time) + void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); + + // No copying allowed + PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) = + delete; + void operator=(const PersistentStatsHistoryIterator&) = delete; + PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete; + PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) = + delete; + + uint64_t time_; + uint64_t start_time_; + uint64_t end_time_; + std::map stats_map_; + Status status_; + bool valid_; + DBImpl* db_impl_; +}; + +} // namespace rocksdb diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc new file mode 100644 index 00000000000..a66043da1fe --- /dev/null +++ b/monitoring/stats_history_test.cc @@ -0,0 +1,576 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
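To make the end-to-end flow concrete before the tests, here is a minimal usage sketch of the new persist_stats_to_disk option together with the existing GetStatsHistory API. This is an illustration under assumptions, not part of the patch: the path /tmp/stats_demo and the 5-second period are arbitrary choices, and error handling is reduced to an early return.

#include <cinttypes>
#include <cstdio>
#include <limits>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"
#include "rocksdb/stats_history.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.stats_persist_period_sec = 5;  // take a stats slice every 5 seconds
  options.persist_stats_to_disk = true;  // store slices in the hidden stats CF

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/stats_demo", &db);
  if (!s.ok()) return 1;

  // Read back every persisted slice; both bounds are epoch seconds and the
  // end bound is exclusive.
  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  s = db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &it);
  for (; s.ok() && it != nullptr && it->Valid(); it->Next()) {
    std::printf("slice at %" PRIu64 "\n", it->GetStatsTime());
    for (const auto& stat : it->GetStatsMap()) {
      // stat.first is the ticker name, stat.second the delta for this slice.
      std::printf("  %s = %" PRIu64 "\n", stat.first.c_str(), stat.second);
    }
  }
  delete db;
  return 0;
}

Note that persist_stats_to_disk lives in ImmutableDBOptions, so switching between the in-memory and on-disk histories requires reopening the DB; the tests below therefore exercise it through Reopen/ReopenWithColumnFamilies rather than SetDBOptions.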
+#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace rocksdb { + +class StatsHistoryTest : public DBTestBase { + public: + StatsHistoryTest() : DBTestBase("/stats_history_test") {} +}; + +TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + int counter = 0; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cancelling the job through SetOptions + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); + int old_val = counter; + for (int i = 6; i < 20; ++i) { + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + } + ASSERT_EQ(counter, old_val); + Close(); +} + +// Test persistent stats background thread scheduling and cancelling +TEST_F(StatsHistoryTest, StatsPersistScheduling) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cancelling the job through SetOptions + ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + Close(); +} + +// Test enabling persistent
stats for the first time +TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 0; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + Close(); +} + +// TODO(Zhongyi): Move persistent stats related tests to a separate file +TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + int mock_time = 1; + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + // disabled stats snapshots + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + size_t stats_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5); + stats_count += stats_map.size(); + } + ASSERT_GT(stats_count, 0); + // Wait a bit and verify no more stats are found + for (mock_time = 6; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_new = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + stats_count_new += stats_iter->GetStatsMap().size(); + } + ASSERT_EQ(stats_count_new, stats_count); + Close(); +} + +TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) 
{ + Options options; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.stats_persist_period_sec = 1; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // some random operation to populate statistics + ASSERT_OK(Delete("foo")); + ASSERT_OK(Put("sol", "sol")); + ASSERT_OK(Put("epic", "epic")); + ASSERT_OK(Put("ltd", "ltd")); + ASSERT_EQ("sol", Get("sol")); + ASSERT_EQ("epic", Get("epic")); + ASSERT_EQ("ltd", Get("ltd")); + Iterator* iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + ASSERT_OK(Delete("sol")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + int mock_time = 1; + // Wait for stats persist to finish + for (; mock_time < 5; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + + // second round of ops + ASSERT_OK(Put("saigon", "saigon")); + ASSERT_OK(Put("noodle talk", "noodle talk")); + ASSERT_OK(Put("ping bistro", "ping bistro")); + iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + for (; mock_time < 10; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + ASSERT_GE(slice_count, 9); + ASSERT_GE(stats_history_size, 12000); + // capping memory cost at 12000 bytes since one slice is around 10000~12000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); + ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish + for (; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count_reopen += stats_map.size(); + } + size_t 
stats_history_size_reopen = + dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + // only one slice can fit under the new stats_history_buffer_size + ASSERT_LT(slice_count, 2); + ASSERT_TRUE(stats_history_size_reopen < 12000 && + stats_history_size_reopen > 0); + ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); + Close(); + // TODO: may also want to verify stats timestamp to make sure we are purging + // the correct stats snapshot +} + +int countkeys(Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + return count; +} + +TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count1 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count2 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count3 = countkeys(iter); + delete iter; + ASSERT_GE(key_count2, key_count1); + ASSERT_GE(key_count3, key_count2); + ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + int non_zero_count = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count++; + } + } + stats_count += stats_map.size(); + } + ASSERT_EQ(slice_count, 3); + // 2 extra keys for format version + ASSERT_EQ(stats_count, key_count3 - 2); + // verify reopen will not cause data loss + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + int slice_count_reopen = 0; + int non_zero_count_recover = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count_reopen++; + auto stats_map = stats_iter->GetStatsMap(); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count_recover++; + } + } + stats_count_reopen += stats_map.size(); + } + ASSERT_EQ(non_zero_count, non_zero_count_recover); + ASSERT_EQ(slice_count, slice_count_reopen); + ASSERT_EQ(stats_count, stats_count_reopen); + Close(); +} + +// Test that persisted stats match the values found in options.statistics and +// that the stats values are retained after DB reopen +TEST_F(StatsHistoryTest, PersistentStatsVerifyValue)
{ + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + std::map stats_map_before; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(20); }); + + std::map stats_map_after; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + std::string sample = "rocksdb.num.iterator.deleted"; + uint64_t recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, stats_map_after[sample]); + + // test stats value retains after recovery + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + uint64_t new_recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + new_recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, new_recovered_value); + + // TODO(Zhongyi): also add test to read raw values from disk and verify + // correctness + Close(); +} + +// TODO(Zhongyi): add test for different format versions + +TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + ASSERT_OK(TryReopen(options)); + CreateColumnFamilies({"one", "two", "three"}, options); + ASSERT_OK(Put(1, "foo", "bar")); + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + CreateColumnFamilies({"four"}, options); + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + 
ASSERT_EQ(Get(2, "foo"), "bar"); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count = countkeys(iter); + delete iter; + ASSERT_GE(key_count, 0); + uint64_t num_write_wal = 0; + std::string sample = "rocksdb.write.wal"; + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + stats_iter.reset(); + ASSERT_EQ(num_write_wal, 2); + + options.persist_stats_to_disk = false; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + int cf_count = 0; + for (auto cfd : *dbfull()->versions_->GetColumnFamilySet()) { + (void)cfd; + cf_count++; + } + // persistent stats cf will be implicitly opened even if + // persist_stats_to_disk is false + ASSERT_EQ(cf_count, 6); + ASSERT_EQ(Get(2, "foo"), "bar"); + + // attempt to create column family using same name, should fail + ColumnFamilyOptions cf_opts(options); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + + options.persist_stats_to_disk = true; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + // verify stats is not affected by prior failed CF creation + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + num_write_wal = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + ASSERT_EQ(num_write_wal, 2); + + Close(); + Destroy(options); +} + +TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { + ASSERT_OK(Put("bar", "v2")); + Close(); + + auto options = CurrentOptions(); + options.stats_persist_period_sec = 5; + options.persist_stats_to_disk = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v2", Get("bar")); + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. 
+ ASSERT_OK(ReadOnlyReopen(options)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/options/db_options.cc b/options/db_options.cc index bdcdd250a0a..490a3708030 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -84,7 +84,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), - avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + persist_stats_to_disk(options.persist_stats_to_disk) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -222,6 +223,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", + persist_stats_to_disk); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 67b26786f5e..92eea4ecfa1 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -81,6 +81,7 @@ struct ImmutableDBOptions { bool manual_wal_flush; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool persist_stats_to_disk; }; struct MutableDBOptions { diff --git a/options/options.cc b/options/options.cc index 1d2b6193cbc..5efd3ce5742 100644 --- a/options/options.cc +++ b/options/options.cc @@ -502,7 +502,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_factory.reset(new BlockBasedTableFactory(table_options)); - return this; } diff --git a/options/options_helper.cc b/options/options_helper.cc index 388256abd9f..71a7f9b2fc0 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -84,6 +84,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec; options.stats_persist_period_sec = mutable_db_options.stats_persist_period_sec; + options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk; options.stats_history_buffer_size = mutable_db_options.stats_history_buffer_size; options.advise_random_on_open = immutable_db_options.advise_random_on_open; @@ -1580,6 +1581,10 @@ std::unordered_map {offsetof(struct DBOptions, stats_persist_period_sec), OptionType::kUInt, OptionVerificationType::kNormal, true, offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, + {"persist_stats_to_disk", + {offsetof(struct DBOptions, persist_stats_to_disk), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, {"stats_history_buffer_size", {offsetof(struct DBOptions, stats_history_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 6044cc4b1c4..f0b79e372f7 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -265,6 +265,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "allow_mmap_writes=false;" "stats_dump_period_sec=70127;" "stats_persist_period_sec=54321;" + "persist_stats_to_disk=true;" "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" diff --git 
a/options/options_test.cc b/options/options_test.cc index 9fcd241d70f..24aeec99e17 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -129,6 +129,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"skip_log_error_on_recovery", "false"}, {"stats_dump_period_sec", "46"}, {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -267,6 +268,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); diff --git a/src.mk b/src.mk index 150b1c10af9..e48a6959515 100644 --- a/src.mk +++ b/src.mk @@ -36,7 +36,6 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ - db/in_memory_stats_history.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ @@ -86,10 +85,12 @@ LIB_SOURCES = \ memtable/write_buffer_manager.cc \ monitoring/histogram.cc \ monitoring/histogram_windowing.cc \ + monitoring/in_memory_stats_history.cc \ monitoring/instrumented_mutex.cc \ monitoring/iostats_context.cc \ monitoring/perf_context.cc \ monitoring/perf_level.cc \ + monitoring/persistent_stats_history.cc \ monitoring/statistics.cc \ monitoring/thread_status_impl.cc \ monitoring/thread_status_updater.cc \ @@ -105,21 +106,21 @@ LIB_SOURCES = \ port/port_posix.cc \ port/stack_trace.cc \ table/adaptive/adaptive_table_factory.cc \ - table/block_based/block.cc \ - table/block_based/block_based_filter_block.cc \ - table/block_based/block_based_table_builder.cc \ - table/block_based/block_based_table_factory.cc \ - table/block_based/block_based_table_reader.cc \ - table/block_based/block_builder.cc \ - table/block_based/block_prefix_index.cc \ - table/block_based/data_block_hash_index.cc \ - table/block_based/data_block_footer.cc \ - table/block_based/flush_block_policy.cc \ - table/block_based/full_filter_block.cc \ - table/block_based/index_builder.cc \ - table/block_based/partitioned_filter_block.cc \ - table/block_fetcher.cc \ - table/bloom_block.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_fetcher.cc \ + table/bloom_block.cc \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ @@ -233,27 +234,27 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif -TOOL_LIB_SOURCES = \ +TOOL_LIB_SOURCES = \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ -ANALYZER_LIB_SOURCES = \ +ANALYZER_LIB_SOURCES = \ tools/block_cache_trace_analyzer.cc \ - tools/trace_analyzer_tool.cc \ + tools/trace_analyzer_tool.cc \ -MOCK_LIB_SOURCES = \ - 
table/mock_table.cc \ +MOCK_LIB_SOURCES = \ + table/mock_table.cc \ test_util/fault_injection_test_env.cc -BENCH_LIB_SOURCES = \ +BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ -TEST_LIB_SOURCES = \ +TEST_LIB_SOURCES = \ db/db_test_util.cc \ - test_util/testharness.cc \ - test_util/testutil.cc \ + test_util/testharness.cc \ + test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ MAIN_SOURCES = \ @@ -301,7 +302,7 @@ MAIN_SOURCES = \ db/dbformat_test.cc \ db/deletefile_test.cc \ db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ @@ -352,12 +353,13 @@ MAIN_SOURCES = \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ + monitoring/stats_history_test.cc \ options/options_test.cc \ - table/block_based/block_based_filter_block_test.cc \ - table/block_based/block_test.cc \ - table/block_based/data_block_hash_index_test.cc \ - table/block_based/full_filter_block_test.cc \ - table/block_based/partitioned_filter_block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ table/cuckoo/cuckoo_table_builder_test.cc \ table/cuckoo/cuckoo_table_reader_test.cc \ @@ -373,7 +375,7 @@ MAIN_SOURCES = \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ + tools/trace_analyzer_test.cc \ trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index a14758418c3..9b3e2cac35f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1146,6 +1146,8 @@ DEFINE_uint64(stats_dump_period_sec, rocksdb::Options().stats_dump_period_sec, DEFINE_uint64(stats_persist_period_sec, rocksdb::Options().stats_persist_period_sec, "Gap between persisting stats in seconds"); +DEFINE_bool(persist_stats_to_disk, rocksdb::Options().persist_stats_to_disk, + "whether to persist stats to disk"); DEFINE_uint64(stats_history_buffer_size, rocksdb::Options().stats_history_buffer_size, "Max number of stats snapshots to keep in memory"); @@ -3727,6 +3729,7 @@ class Benchmark { static_cast(FLAGS_stats_dump_period_sec); options.stats_persist_period_sec = static_cast(FLAGS_stats_persist_period_sec); + options.persist_stats_to_disk = FLAGS_persist_stats_to_disk; options.stats_history_buffer_size = static_cast(FLAGS_stats_history_buffer_size); From 7d8d56413dbc375cb1257306c101f99f2eb75386 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 17 Jun 2019 15:36:20 -0700 Subject: [PATCH 152/572] Override check consistency for DBImplSecondary (#5469) Summary: `DBImplSecondary` calls `CheckConsistency()` during open. In the past, `DBImplSecondary` did not override this function, so `DBImpl::CheckConsistency()` was called. The following can happen: the secondary instance is performing a consistency check, which calls `GetFileSize(file_path)`, but the file at `file_path` has been deleted by the primary instance. `DBImpl::CheckConsistency` does not account for this and fails the consistency check. This is undesirable. The solution is that we call `DBImpl::CheckConsistency()` first. If it passes, then we are good. If not, we give it a second chance and handle the case of file(s) being deleted. 
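As a hedged sketch of the second chance described above (simplified, with a hypothetical helper name; the actual change is the `DBImplSecondary::CheckConsistency()` override below), the relaxation amounts to tolerating `PathNotFound` on files the primary may have deleted:

```c++
#include <cstdint>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/status.h"

// Hypothetical helper for illustration only: a live file that has vanished is
// treated as deleted by the primary rather than as corruption.
rocksdb::Status CheckOneLiveFile(rocksdb::Env* env,
                                 const std::string& file_path) {
  uint64_t file_size = 0;
  rocksdb::Status s = env->GetFileSize(file_path, &file_size);
  if (s.IsPathNotFound()) {
    // Best-effort: the primary removed the file after it was listed as live.
    return rocksdb::Status::OK();
  }
  return s;
}
```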
Test plan (on dev server): ``` $make clean && make -j20 all $./db_secondary_test ``` All other existing unit tests must pass as well. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5469 Differential Revision: D15861845 Pulled By: riversand963 fbshipit-source-id: 507d72392508caed3cd003bb2e2aa43f993dd597 --- db/db_impl/db_impl.cc | 2 ++ db/db_impl/db_impl_secondary.cc | 38 +++++++++++++++++++++++++++++++ db/db_impl/db_impl_secondary.h | 6 +++++ db/db_impl/db_secondary_test.cc | 40 +++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 21b8f3d9165..6341b76854c 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3017,6 +3017,7 @@ Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); std::string corruption_messages; for (const auto& md : metadata) { @@ -3024,6 +3025,7 @@ Status DBImpl::CheckConsistency() { std::string file_path = md.db_path + md.name; uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); Status s = env_->GetFileSize(file_path, &fsize); if (!s.ok() && env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 5cd0beb1f0c..8b93f675f8c 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -451,6 +451,44 @@ Status DBImplSecondary::NewIterators( return Status::OK(); } +Status DBImplSecondary::CheckConsistency() { + mutex_.AssertHeld(); + Status s = DBImpl::CheckConsistency(); + // If DBImpl::CheckConsistency(), which is stricter, returns success, then we + // do not need to give a second chance. + if (s.ok()) { + return s; + } + // It's possible that DBImpl::CheckConsistency() can fail because the primary + // may have removed certain files, causing the GetFileSize(name) call to + // fail and return a PathNotFound. In this case, we take a best-effort + // approach and just proceed. + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || + s.IsPathNotFound())) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } + } + return corruption_messages.empty() ? Status::OK() + : Status::Corruption(corruption_messages); +} + Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 24cfd33c11d..ca853e25802 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -197,6 +197,12 @@ class DBImplSecondary : public DBImpl { Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + // Check if all live files exist on the file system and that their file sizes + // match the in-memory records. It is possible that some live files may + // have been deleted by the primary. 
In this case, CheckConsistency() does + // not flag the missing file as an inconsistency. + Status CheckConsistency() override; + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c9aaa361191..c79589d5022 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -705,6 +705,46 @@ TEST_F(DBSecondaryTest, CatchUpAfterFlush) { iter3->Seek("key1"); ASSERT_FALSE(iter3->Valid()); } + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} #endif //! ROCKSDB_LITE } // namespace rocksdb From 2d1dd5bce7f1c34723e55de57d8f205576cd3e75 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 16:33:40 -0700 Subject: [PATCH 153/572] Support computing miss ratio curves using sim_cache. (#5449) Summary: This PR adds a BlockCacheTraceSimulator that reports the miss ratios given different cache configurations. A cache configuration contains "cache_name,num_shard_bits,cache_capacities". For example, "lru, 1, 1K, 2K, 4M, 4G". When we replay the trace, we also perform lookups and inserts on the simulated caches. In the end, it reports the miss ratio for each tuple in an output file. This PR also adds a main source block_cache_trace_analyzer so that we can run the analyzer from the command line. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5449 Test Plan: Added tests for block_cache_trace_analyzer. COMPILE_WITH_ASAN=1 make check -j32. 
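As a hedged illustration of the configuration format (the struct is defined in the new `tools/block_cache_trace_analyzer.h` below; it is copied here only so the snippet is self-contained), a config line such as `lru,1,1K,2K,4M,4G` populates `CacheConfiguration` roughly like this:

```c++
#include <cstdint>
#include <string>
#include <vector>

// Local copy of the struct from tools/block_cache_trace_analyzer.h, for
// illustration only.
struct CacheConfiguration {
  std::string cache_name;                  // only "lru" is supported
  uint32_t num_shard_bits = 0;
  std::vector<uint64_t> cache_capacities;  // simulated capacities in bytes
};

int main() {
  CacheConfiguration config;
  config.cache_name = "lru";
  config.num_shard_bits = 1;
  // 1K, 2K, 4M, and 4G from the example line above.
  config.cache_capacities = {1024, 2048, 4ULL << 20, 4ULL << 30};
  return 0;
}
```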
Differential Revision: D15797073 Pulled By: HaoyuHuang fbshipit-source-id: aef0c5c2e7938f3e8b6a10d4a6a50e6928ecf408 --- Makefile | 4 + include/rocksdb/utilities/sim_cache.h | 4 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 254 +++++++++++++++++++++-- tools/block_cache_trace_analyzer.h | 60 +++++- tools/block_cache_trace_analyzer_test.cc | 111 +++++++++- tools/block_cache_trace_analyzer_tool.cc | 25 +++ utilities/simulator_cache/sim_cache.cc | 22 +- 8 files changed, 449 insertions(+), 32 deletions(-) create mode 100644 tools/block_cache_trace_analyzer_tool.cc diff --git a/Makefile b/Makefile index a499cbbedd7..8e8c0ac7638 100644 --- a/Makefile +++ b/Makefile @@ -608,6 +608,7 @@ TOOLS = \ rocksdb_undump \ blob_dump \ trace_analyzer \ + block_cache_trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a @@ -1109,6 +1110,9 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +block_cache_trace_analyzer: tools/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index bc2a7bc13d9..fef9e9910e8 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -36,6 +36,10 @@ extern std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits); +extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); + class SimCache : public Cache { public: SimCache() {} diff --git a/src.mk b/src.mk index e48a6959515..71c2bd01803 100644 --- a/src.mk +++ b/src.mk @@ -369,6 +369,7 @@ MAIN_SOURCES = \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ tools/block_cache_trace_analyzer_test.cc \ + tools/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 5d9b2d18409..0ef4b55e46f 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -3,11 +3,44 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE +#ifdef GFLAGS #include "tools/block_cache_trace_analyzer.h" #include +#include +#include +#include #include +#include #include "monitoring/histogram.h" +#include "util/gflags_compat.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(block_cache_trace_path, "", "The trace file path."); +DEFINE_string( + block_cache_sim_config_path, "", + "The config file path. One cache configuration per line. The format of a " + "cache configuration is " + "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " + "cache_name is lru. 
cache_capacity can be xK, xM or xG " + "where x is a positive number."); +DEFINE_bool(print_block_size_stats, false, + "Print block size distribution and the distribution break down by " + "block type and column family."); +DEFINE_bool(print_access_count_stats, false, + "Print access count distribution and the distribution break down " + "by block type and column family."); +DEFINE_bool(print_data_block_access_count_stats, false, + "Print data block accesses by user Get and Multi-Get."); +DEFINE_int32(cache_sim_warmup_seconds, 0, + "The number of seconds to warmup simulated caches. The hit/miss " + "counters are reset after the warmup completes."); +DEFINE_string(output_miss_ratio_curve_path, "", + "The output file to save the computed miss ratios. File format: " + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"); namespace rocksdb { namespace { @@ -48,11 +81,101 @@ std::string caller_to_string(BlockCacheLookupCaller caller) { // This cannot happen. return "InvalidCaller"; } + +const char kBreakLine[] = + "***************************************************************\n"; + +void print_break_lines(uint32_t num_break_lines) { + for (uint32_t i = 0; i < num_break_lines; i++) { + fprintf(stdout, kBreakLine); + } +} + } // namespace +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + cache_configurations_(cache_configurations) { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + sim_caches_.push_back( + NewSimCache(NewLRUCache(cache_capacity, config.num_shard_bits), + /*real_cache=*/nullptr, config.num_shard_bits)); + } + } +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. + if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <= + access.access_timestamp) { + for (auto& sim_cache : sim_caches_) { + sim_cache->reset_counter(); + } + warmup_complete_ = true; + } + for (auto& sim_cache : sim_caches_) { + auto handle = sim_cache->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + sim_cache->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr); + } + } +} + +void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { + if (!cache_simulator_) { + return; + } + if (output_miss_ratio_curve_path_.empty()) { + return; + } + std::ofstream out(output_miss_ratio_curve_path_); + if (!out.is_open()) { + return; + } + // Write header. + const std::string header = + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; + out << header << std::endl; + uint64_t sim_cache_index = 0; + for (auto const& config : cache_simulator_->cache_configurations()) { + for (auto cache_capacity : config.cache_capacities) { + uint64_t hits = + cache_simulator_->sim_caches()[sim_cache_index]->get_hit_counter(); + uint64_t misses = + cache_simulator_->sim_caches()[sim_cache_index]->get_miss_counter(); + uint64_t total_accesses = hits + misses; + double miss_ratio = static_cast(misses * 100.0 / total_accesses); + // Write the body. 
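+    // One CSV row per simulated cache, matching the header written above:
+    // cache_name,num_shard_bits,capacity,miss_ratio,total_accesses.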
+ out << config.cache_name; + out << ","; + out << config.num_shard_bits; + out << ","; + out << cache_capacity; + out << ","; + out << std::fixed << std::setprecision(4) << miss_ratio; + out << ","; + out << total_accesses; + out << std::endl; + sim_cache_index++; + } + } + out.close(); +} + BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( - const std::string& trace_file_path) - : trace_file_path_(trace_file_path) { + const std::string& trace_file_path, + const std::string& output_miss_ratio_curve_path, + std::unique_ptr&& cache_simulator) + : trace_file_path_(trace_file_path), + output_miss_ratio_curve_path_(output_miss_ratio_curve_path), + cache_simulator_(std::move(cache_simulator)) { env_ = rocksdb::Env::Default(); } @@ -88,6 +211,9 @@ Status BlockCacheTraceAnalyzer::Analyze() { return s; } RecordAccess(access); + if (cache_simulator_) { + cache_simulator_->Access(access); + } } return Status::OK(); } @@ -118,6 +244,7 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { } fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block size stats for block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -125,6 +252,7 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { for (auto const& cf_bt_stats : cf_bt_stats_map) { const std::string& cf_name = cf_bt_stats.first; for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block size stats for column family %s and block type %s: \n%s", cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), @@ -160,6 +288,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { fprintf(stdout, "Block access count stats: \n%s", access_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block access count stats for block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -167,6 +296,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { for (auto const& cf_bt_stats : cf_bt_stats_map) { const std::string& cf_name = cf_bt_stats.first; for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block access count stats for column family %s and block type " "%s: \n%s", @@ -230,23 +360,28 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { "the total number of keys in a block: \n%s", existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); fprintf( stdout, "Histogram on percentage of referenced keys DO NOT exist in a block over " "the total number of keys in a block: \n%s", non_existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_non_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Histogram on percentage of accesses on keys exist in a block over " "the total number of accesses in a block: \n%s", 
block_access_stats.ToString().c_str()); for (auto const& cf_stats : cf_block_access_info) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } @@ -318,15 +453,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } // Print stats. - fprintf( - stdout, - "***************************************************************\n"); - fprintf( - stdout, - "***************************************************************\n"); - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str()); fprintf(stdout, "Number of files:%" PRIu64 "Number of blocks: %" PRIu64 @@ -338,9 +465,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { block_type.second); } for (auto caller : cf_caller_num_accesses_map) { - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", caller_to_string(caller.first).c_str(), caller.second); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", @@ -368,12 +493,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } } } - fprintf(stdout, - "***************************************************************\n"); - fprintf(stdout, - "***************************************************************\n"); - fprintf(stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Overall statistics:\n"); fprintf(stdout, "Number of files: %" PRIu64 " Number of blocks: %" PRIu64 @@ -384,9 +504,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { block_type_to_string(block_type.first).c_str(), block_type.second); } for (auto caller : caller_num_access_map) { - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", caller_to_string(caller.first).c_str(), caller.second); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", @@ -405,4 +523,94 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } } +std::vector parse_cache_config_file( + const std::string& config_path) { + std::ifstream file(config_path); + if (!file.is_open()) { + return {}; + } + std::vector configs; + std::string line; + while (getline(file, line)) { + CacheConfiguration cache_config; + std::stringstream ss(line); + std::vector config_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + config_strs.push_back(substr); + } + // Sanity checks. 
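+    // A valid line must supply at least cache_name, num_shard_bits, and one
+    // cache capacity.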
+ if (config_strs.size() < 3) { + fprintf(stderr, "Invalid cache simulator configuration %s\n", + line.c_str()); + exit(1); + } + if (config_strs[0] != "lru") { + fprintf(stderr, "We only support LRU cache %s\n", line.c_str()); + exit(1); + } + cache_config.cache_name = config_strs[0]; + cache_config.num_shard_bits = ParseUint32(config_strs[1]); + for (uint32_t i = 2; i < config_strs.size(); i++) { + uint64_t capacity = ParseUint64(config_strs[i]); + if (capacity == 0) { + fprintf(stderr, "Invalid cache capacity %s, %s\n", + config_strs[i].c_str(), line.c_str()); + exit(1); + } + cache_config.cache_capacities.push_back(capacity); + } + configs.push_back(cache_config); + } + file.close(); + return configs; +} + +int block_cache_trace_analyzer_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_block_cache_trace_path.empty()) { + fprintf(stderr, "block cache trace path is empty\n"); + exit(1); + } + uint64_t warmup_seconds = + FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0; + std::vector cache_configs = + parse_cache_config_file(FLAGS_block_cache_sim_config_path); + std::unique_ptr cache_simulator; + if (!cache_configs.empty()) { + cache_simulator.reset( + new BlockCacheTraceSimulator(warmup_seconds, cache_configs)); + } + BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, + FLAGS_output_miss_ratio_curve_path, + std::move(cache_simulator)); + Status s = analyzer.Analyze(); + if (!s.IsIncomplete()) { + // Read all traces. + fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); + exit(1); + } + + analyzer.PrintStatsSummary(); + if (FLAGS_print_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats(); + } + if (FLAGS_print_block_size_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintBlockSizeStats(); + } + if (FLAGS_print_data_block_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintDataBlockAccessStats(); + } + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintMissRatioCurves(); + return 0; +} + } // namespace rocksdb + +#endif // GFLAGS +#endif // ROCKSDB_LITE diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 51bb1ec7930..1420906f3cf 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -9,10 +9,56 @@ #include #include "rocksdb/env.h" +#include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { +class BlockCacheTraceAnalyzer; + +// A cache configuration provided by user. +struct CacheConfiguration { + std::string cache_name; // LRU. + uint32_t num_shard_bits; + std::vector + cache_capacities; // simulate cache capacities in bytes. +}; + +// A block cache simulator that reports miss ratio curves given a set of cache +// configurations. +class BlockCacheTraceSimulator { + public: + // warmup_seconds: The number of seconds to warmup simulated caches. The + // hit/miss counters are reset after the warmup completes. + BlockCacheTraceSimulator( + uint64_t warmup_seconds, + const std::vector& cache_configurations); + ~BlockCacheTraceSimulator() = default; + // No copy and move. 
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + void Access(const BlockCacheTraceRecord& access); + + const std::vector>& sim_caches() const { + return sim_caches_; + } + + const std::vector& cache_configurations() const { + return cache_configurations_; + } + + private: + const uint64_t warmup_seconds_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::vector> sim_caches_; + uint64_t trace_start_time_ = 0; +}; + // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; @@ -67,7 +113,10 @@ struct ColumnFamilyAccessInfoAggregate { class BlockCacheTraceAnalyzer { public: - BlockCacheTraceAnalyzer(const std::string& trace_file_path); + BlockCacheTraceAnalyzer( + const std::string& trace_file_path, + const std::string& output_miss_ratio_curve_path, + std::unique_ptr&& cache_simulator); ~BlockCacheTraceAnalyzer() = default; // No copy and move. BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete; @@ -115,6 +164,8 @@ class BlockCacheTraceAnalyzer { // accesses on keys exist in a data block and its break down by column family. void PrintDataBlockAccessStats() const; + void PrintMissRatioCurves() const; + const std::map& TEST_cf_aggregates_map() const { return cf_aggregates_map_; @@ -124,9 +175,14 @@ class BlockCacheTraceAnalyzer { void RecordAccess(const BlockCacheTraceRecord& access); rocksdb::Env* env_; - std::string trace_file_path_; + const std::string trace_file_path_; + const std::string output_miss_ratio_curve_path_; + BlockCacheTraceHeader header_; + std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; }; +int block_cache_trace_analyzer_tool(int argc, char** argv); + } // namespace rocksdb diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index a75804492f6..df99e1f616e 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -3,6 +3,18 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, + "Please install gflags to run block_cache_trace_analyzer_test\n"); + return 1; +} +#else + +#include +#include #include #include @@ -25,6 +37,8 @@ const uint64_t kSSTStoringEvenKeys = 100; const uint64_t kSSTStoringOddKeys = 101; const std::string kRefKeyPrefix = "test-get-"; const uint64_t kNumKeysInBlock = 1024; +const int kMaxArgCount = 100; +const size_t kArgBufferSize = 100000; } // namespace class BlockCacheTracerTest : public testing::Test { @@ -34,6 +48,8 @@ class BlockCacheTracerTest : public testing::Test { env_ = rocksdb::Env::Default(); EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; + block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; + output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve"; } ~BlockCacheTracerTest() override { @@ -125,12 +141,94 @@ class BlockCacheTracerTest : public testing::Test { } } + void RunBlockCacheTraceAnalyzer() { + std::vector params = { + "./block_cache_trace_analyzer", + "-block_cache_trace_path=" + trace_file_path_, + "-block_cache_sim_config_path=" + block_cache_sim_config_path_, + "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_, + "-print_block_size_stats", + "-print_access_count_stats", + "-print_data_block_access_count_stats", + "-cache_sim_warmup_seconds=0"}; + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast(arg.size()) + 1; + } + ASSERT_EQ(0, rocksdb::block_cache_trace_analyzer_tool(argc, argv)); + } + Env* env_; EnvOptions env_options_; + std::string output_miss_ratio_curve_path_; + std::string block_cache_sim_config_path_; std::string trace_file_path_; std::string test_path_; }; +TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { + { + // Generate a trace file. + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + ASSERT_OK(writer.WriteHeader()); + WriteBlockAccess(&writer, 0, TraceType::kBlockTraceDataBlock, 50); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Generate a cache sim config. + std::string config = "lru,1,1K,1M,1G"; + std::ofstream out(block_cache_sim_config_path_); + ASSERT_TRUE(out.is_open()); + out << config << std::endl; + out.close(); + } + RunBlockCacheTraceAnalyzer(); + { + // Validate the cache miss ratios. + const std::vector expected_capacities{1024, 1024 * 1024, + 1024 * 1024 * 1024}; + std::ifstream infile(output_miss_ratio_curve_path_); + uint32_t config_index = 0; + std::string line; + // Read header. 
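+    // The first line is the CSV header emitted by PrintMissRatioCurves().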
+ ASSERT_TRUE(getline(infile, line)); + while (getline(infile, line)) { + std::stringstream ss(line); + std::vector result_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + result_strs.push_back(substr); + } + ASSERT_EQ(5, result_strs.size()); + ASSERT_LT(config_index, expected_capacities.size()); + ASSERT_EQ("lru", result_strs[0]); // cache_name + ASSERT_EQ("1", result_strs[1]); // num_shard_bits + ASSERT_EQ(std::to_string(expected_capacities[config_index]), + result_strs[2]); // cache_capacity + ASSERT_EQ("100.0000", result_strs[3]); // miss_ratio + ASSERT_EQ("50", result_strs[4]); // number of accesses. + config_index++; + } + ASSERT_EQ(expected_capacities.size(), config_index); + infile.close(); + } + ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_)); + ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. @@ -164,7 +262,9 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); // Read blocks. - BlockCacheTraceAnalyzer analyzer(trace_file_path_); + BlockCacheTraceAnalyzer analyzer(trace_file_path_, + /*output_miss_ratio_curve_path=*/"", + /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); const uint64_t expected_num_cfs = 1; @@ -228,3 +328,12 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } +#endif // GFLAG +#else +#include +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n"); + return 0; +} +#endif // ROCKSDB_LITE diff --git a/tools/block_cache_trace_analyzer_tool.cc b/tools/block_cache_trace_analyzer_tool.cc new file mode 100644 index 00000000000..b7b36c5d241 --- /dev/null +++ b/tools/block_cache_trace_analyzer_tool.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else // GFLAGS +#include "tools/block_cache_trace_analyzer.h" +int main(int argc, char** argv) { + return rocksdb::block_cache_trace_analyzer_tool(argc, argv); +} +#endif // GFLAGS +#else // ROCKSDB_LITE +#include +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index 8629b60b095..f6f1e671450 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -152,10 +152,9 @@ class SimCacheImpl : public SimCache { public: // capacity for real cache (ShardedLRUCache) // test_capacity for key only cache - SimCacheImpl(std::shared_ptr cache, size_t sim_capacity, - int num_shard_bits) + SimCacheImpl(std::shared_ptr sim_cache, std::shared_ptr cache) : cache_(cache), - key_only_cache_(NewLRUCache(sim_capacity, num_shard_bits)), + key_only_cache_(sim_cache), miss_times_(0), hit_times_(0), stats_(nullptr) {} @@ -185,7 +184,9 @@ class SimCacheImpl : public SimCache { } cache_activity_logger_.ReportAdd(key, charge); - + if (!cache_) { + return Status::OK(); + } return cache_->Insert(key, value, charge, deleter, handle, priority); } @@ -201,7 +202,9 @@ class SimCacheImpl : public SimCache { } cache_activity_logger_.ReportLookup(key); - + if (!cache_) { + return nullptr; + } return cache_->Lookup(key, stats); } @@ -326,10 +329,17 @@ class SimCacheImpl : public SimCache { // For instrumentation purpose, use NewSimCache instead std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits) { + return NewSimCache(NewLRUCache(sim_capacity, num_shard_bits), cache, + num_shard_bits); +} + +std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } - return std::make_shared(cache, sim_capacity, num_shard_bits); + return std::make_shared(sim_cache, cache); } } // end namespace rocksdb From bcfc53b436b386d5a894bf10678b38c058aa1624 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 17:56:09 -0700 Subject: [PATCH 154/572] Block cache tracing: Fix minor bugs with downsampling and some benchmark results. (#5473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: As the code changes for block cache tracing are almost complete, I did a benchmark to compare the performance when block cache tracing is enabled/disabled. With 1% downsampling ratio, the performance overhead of block cache tracing is negligible. When we trace all block accesses, the throughput drops by 6 folds with 16 threads issuing random reads and all reads are served in block cache. Setup: RocksDB: version 6.2 Date: Mon Jun 17 17:11:13 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 10000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 1144.4 MB (estimated) FileSize: 1144.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 I ran the readrandom workload for 1 minute. Detailed throughput results: (ops/second) Sample rate 0: no block cache tracing. 
Sample rate 1: trace all block accesses. Sample rate 100: trace accesses to 1% of blocks.

1 thread | Sample rate 0 | Sample rate 1 | Sample rate 100
-- | -- | -- | --
1 MB block cache size | 13,094 | 13,166 | 13,341
10 GB block cache size | 202,243 | 188,677 | 229,182

16 threads | Sample rate 0 | Sample rate 1 | Sample rate 100
-- | -- | -- | --
1 MB block cache size | 208,761 | 178,700 | 201,872
10 GB block cache size | 2,645,996 | 426,295 | 2,587,605

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5473 Differential Revision: D15869479 Pulled By: HaoyuHuang fbshipit-source-id: 7ae802abe84811281a6af8649f489887cd7c4618 --- tools/block_cache_trace_analyzer.cc | 2 +- trace_replay/block_cache_tracer.cc | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 0ef4b55e46f..3fd93a0239b 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -442,7 +442,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { caller_bt_num_access_map[caller][type] += num_accesses; caller_level_num_access_map[caller][level] += num_accesses; // Column Family stats. - cf_num_accesses++; + cf_num_accesses += num_accesses; cf_caller_num_accesses_map[caller] += num_accesses; cf_caller_level_num_accesses_map[caller][level] += num_accesses; cf_caller_file_num_accesses_map[caller][fd] += num_accesses; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index f733bc9005f..a0f0676eecf 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -16,15 +16,14 @@ namespace rocksdb { namespace { const unsigned int kCharSize = 1; -bool ShouldTrace(const BlockCacheTraceRecord& record, - const TraceOptions& trace_options) { +bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { if (trace_options.sampling_frequency == 0 || trace_options.sampling_frequency == 1) { return true; } // We use spatial downsampling so that we have a complete access history for a // block. - const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); + const uint64_t hash = GetSliceNPHash64(block_key); return hash % trace_options.sampling_frequency == 0; } } // namespace @@ -255,7 +254,7 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, const Slice& block_key, const Slice& cf_name, const Slice& referenced_key) { - if (!writer_.load() || !ShouldTrace(record, trace_options_)) { + if (!writer_.load() || !ShouldTrace(block_key, trace_options_)) { return Status::OK(); } InstrumentedMutexLock lock_guard(&trace_writer_mutex_); From ddd088c8b91f8f63a110cb3262cc4e4d22fab7ca Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 17 Jun 2019 21:12:37 -0700 Subject: [PATCH 155/572] fix rocksdb lite and clang contrun test failures (#5477) Summary: recent commit 671d15cbdd3839acb54cb21a2aa82efca4917155 introduced some test failures: ``` ===== Running stats_history_test [==========] Running 9 tests from 1 test case. [----------] Global test environment set-up. 
[----------] 9 tests from StatsHistoryTest [ RUN ] StatsHistoryTest.RunStatsDumpPeriodSec monitoring/stats_history_test.cc:63: Failure dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}}) Not implemented: Not supported in ROCKSDB LITE db/db_options_test.cc:28:11: error: unused variable 'kMicrosInSec' [-Werror,-Wunused-const-variable] const int kMicrosInSec = 1000000; ``` This PR fixes these failures Pull Request resolved: https://github.com/facebook/rocksdb/pull/5477 Differential Revision: D15871814 Pulled By: miasantreble fbshipit-source-id: 0a7023914d2c1784d9d2d3f5bfb47310d4855394 --- db/db_options_test.cc | 2 -- monitoring/stats_history_test.cc | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 7dd672646b5..fd8d849cd56 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -25,8 +25,6 @@ namespace rocksdb { -const int kMicrosInSec = 1000000; - class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("/db_options_test") {} diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index a66043da1fe..16681fe05d8 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -30,6 +30,7 @@ class StatsHistoryTest : public DBTestBase { public: StatsHistoryTest() : DBTestBase("/stats_history_test") {} }; +#ifndef ROCKSDB_LITE TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { Options options; @@ -566,6 +567,7 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); } +#endif // !ROCKSDB_LITE } // namespace rocksdb From f287f8dc930f0e5455cc236b65960abce6e7bbf0 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 18 Jun 2019 11:16:57 -0700 Subject: [PATCH 156/572] Fix a bug caused by secondary not skipping the beginning of new MANIFEST (#5472) Summary: While the secondary is replaying after the primary, the primary may switch to a new MANIFEST. The secondary is already able to detect and follow the primary to the new MANIFEST. However, the current implementation has a bug, described as follows. The new MANIFEST's first records have been generated by VersionSet::WriteSnapshot to describe the current state of the column families and the db as of the MANIFEST creation. Since the secondary instance has already finished recovering upon start, there is no need for the secondary to process these records. Actually, if the secondary were to replay these records, the secondary may end up adding the same SST files **again** to each column family, causing consistency checks done by VersionBuilder to fail. Therefore, we record the number of records to skip at the beginning of the new MANIFEST and ignore them. Test plan (on dev server) ``` $make clean && make -j32 all $./db_secondary_test ``` All existing unit tests must pass as well. 
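A hedged sketch of the skipping rule (stand-in types, not the actual implementation; the real change is in `ReactiveVersionSet::ReadAndApply` below): `VersionSet::WriteSnapshot()` writes two version edits per live column family at the head of a fresh MANIFEST, so the secondary ignores that many leading records after a switch:

```c++
#include <vector>

// Hypothetical stand-in for ColumnFamilyData, for illustration only.
struct ColumnFamily {
  bool dropped = false;
};

// Number of leading version edits a secondary should skip after detecting a
// MANIFEST switch: two per live (non-dropped) column family.
int EditsToSkip(const std::vector<ColumnFamily>& column_families) {
  int to_skip = 0;
  for (const auto& cf : column_families) {
    if (!cf.dropped) {
      to_skip += 2;  // WriteSnapshot() emits two edits per column family
    }
  }
  return to_skip;
}
```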
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5472 Differential Revision: D15866771 Pulled By: riversand963 fbshipit-source-id: a1eec4837fb2ad13059398efb0f437e74fd53bed --- HISTORY.md | 1 + db/db_impl/db_secondary_test.cc | 28 +++++++++++++++++++ db/version_set.cc | 48 +++++++++++++++++++++++++++++---- db/version_set.h | 3 +++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 228d02b61df..0b6409dbe47 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -28,6 +28,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Fix a bug caused by secondary not skipping the beginning of new MANIFEST. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c79589d5022..26f43c10745 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -525,6 +525,34 @@ TEST_F(DBSecondaryTest, SwitchManifest) { range_scan_db(); } +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + TEST_F(DBSecondaryTest, SwitchWAL) { const int kNumKeysPerMemtable = 1; Options options; diff --git a/db/version_set.cc b/db/version_set.cc index ccedca7940d..9978c8cd463 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5217,7 +5217,8 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr) {} + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5415,6 +5416,17 @@ Status ReactiveVersionSet::ReadAndApply( break; } + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteSnapshot. 
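+    // These edits merely restate state the secondary already recovered at
+    // open; replaying them would re-add the same SST files to each column
+    // family and fail VersionBuilder's consistency checks.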
+ if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } + continue; + } + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; @@ -5463,8 +5475,33 @@ Status ReactiveVersionSet::ReadAndApply( // find the next MANIFEST, we should exit the loop. s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteSnapshot. Since the secondary instance + // has already finished recovering upon start, there is no need for the + // secondary to process these records. Actually, if the secondary were + // to replay these records, the secondary may end up adding the same + // SST files AGAIN to each column family, causing consistency checks + // done by VersionBuilder to fail. Therefore, we record the number of + // records to skip at the beginning of the new MANIFEST and ignore + // them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because WriteSnapshot() + // writes 2 version edits for each column family at the beginning of + // the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. + number_of_edits_to_skip_ += 2; + } + } } } @@ -5504,7 +5541,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return Status::OK(); } if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { + active_version_builders_.end() && !cfd->IsDropped()) { std::unique_ptr builder_guard( new BaseReferencedVersionBuilder(cfd)); active_version_builders_.insert( @@ -5532,6 +5569,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( delete cfd; cfd = nullptr; } + active_version_builders_.erase(builder_iter); } else { builder->Apply(&edit); } @@ -5543,7 +5581,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return s; } - if (cfd != nullptr) { + if (cfd != nullptr && !cfd->IsDropped()) { s = builder->LoadTableHandlers( cfd->internal_stats(), db_options_->max_file_opening_threads, false /* prefetch_index_and_filter_in_cache */, diff --git a/db/version_set.h b/db/version_set.h index 90be94a789a..ba1b4d3e3d0 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1195,6 +1195,9 @@ class ReactiveVersionSet : public VersionSet { std::unordered_map> active_version_builders_; AtomicGroupReadBuffer read_buffer_; + // Number of version edits to skip by ReadAndApply at the beginning of a new + // MANIFEST created by primary. 
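+  // Set to two per live column family whenever a switch to a new MANIFEST is
+  // detected; see ReadAndApply().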
+ int number_of_edits_to_skip_; using VersionSet::LogAndApply; using VersionSet::Recover; From 4bd0cf541dc46cf2320311f047aaa559d5d40d3a Mon Sep 17 00:00:00 2001 From: siddontang Date: Tue, 18 Jun 2019 11:20:52 -0700 Subject: [PATCH 157/572] build on ARM64 (#5450) Summary: Support building RocksDB on AWS ARM64 ``` uname -m aarch64 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5450 Differential Revision: D15879851 fbshipit-source-id: a9b56520a2cd9921338305a06d7103a40a3300b8 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index ac30f9ab0fa..4a52c6cddb7 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -537,7 +537,7 @@ if test -z "$PORTABLE"; then COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER " elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then COMMON_FLAGS="$COMMON_FLAGS -march=z10 " - elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then + elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then # TODO: Handle this with approprite options. COMMON_FLAGS="$COMMON_FLAGS" elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then From 5dc9fbd1175ad10454b877d9044c4b909d00ae3b Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 18 Jun 2019 11:53:43 -0700 Subject: [PATCH 158/572] Update the version of ZStd for the Rocks Java static build Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5228 Differential Revision: D15880451 Pulled By: sagar0 fbshipit-source-id: 84da6f42cac15367d95bffa5336ebd002e7c3308 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8e8c0ac7638..b0b52a37365 100644 --- a/Makefile +++ b/Makefile @@ -1684,8 +1684,8 @@ SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive LZ4_VER ?= 1.8.3 LZ4_SHA256 ?= 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.3.7 -ZSTD_SHA256 ?= 5dd1e90eb16c25425880c8a91327f63de22891ffed082fcc17e5ae84fce0d5fb +ZSTD_VER ?= 1.4.0 +ZSTD_SHA256 ?= 63be339137d2b683c6d19a9e34f4fb684790e864fee13c7dd40e197a64c705c1 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 From 220870523cdfe100fadd29ec98cabd83a8112f82 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 18 Jun 2019 14:52:44 -0700 Subject: [PATCH 159/572] Fix compilation with USE_HDFS (#5444) Summary: The changes in https://github.com/facebook/rocksdb/commit/8272a6de57ed701fb25bb660e074cab703ed3fe7 were untested with `USE_HDFS=1`. There were a couple compiler errors. This PR fixes them. 
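As a minimal sketch of the error class being fixed (illustrative only, not code from this tree): a parameter name commented out to silence unused-parameter warnings, while conditionally compiled code still references it, only breaks the build once that configuration is actually compiled.

```cpp
struct EnvOptions {
  bool strict_bytes_per_sync = false;
};

// Illustrative only: this compiles cleanly until USE_HDFS is defined, at
// which point the body references a parameter whose name was commented out.
int OpenFile(const EnvOptions& /*options*/) {
#ifdef USE_HDFS
  // error: 'options' was not declared in this scope
  return options.strict_bytes_per_sync ? 1 : 0;
#else
  return 0;
#endif
}
```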
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5444 Test Plan: ``` $ EXTRA_LDFLAGS="-L/tmp/hadoop-3.1.2/lib/native/" EXTRA_CXXFLAGS="-I/tmp/hadoop-3.1.2/include" USE_HDFS=1 make -j12 check ``` Differential Revision: D15885009 fbshipit-source-id: 2a0a63739e0b9a2819b461ad63ce1292c4833fe2 --- env/env_hdfs.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 5bdf03ae3e1..207f0815bc4 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -420,7 +420,7 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& /*options*/) { + const EnvOptions& options) { result->reset(); Status s; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); @@ -590,6 +590,11 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); } Status HdfsEnv::NewLogger(const std::string& fname, std::shared_ptr* result) { + // EnvOptions is used exclusively for its `strict_bytes_per_sync` value. That + // option is only intended for WAL/flush/compaction writes, so turn it off in + // the logger. + EnvOptions options; + options.strict_bytes_per_sync = false; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); if (f == nullptr || !f->isValid()) { delete f; From d0c6aea192f546fc049c90d2782636603c1a80f0 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 18 Jun 2019 14:53:35 -0700 Subject: [PATCH 160/572] Revert to respecting only the read_tier read option for index blocks (#5481) Summary: PR https://github.com/facebook/rocksdb/issues/5298 subtly changed how read options are applied to the index block during a Get, MultiGet, or iteration. Earlier, only the read_tier option applied to the index block read; since PR https://github.com/facebook/rocksdb/issues/5298, fill_cache and verify_checksums also have an effect. This patch restores the earlier behavior to prevent surprise memory increases for clients due to the index block not being cached. 
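A hedged sketch of the restored client-visible contract (`db` is assumed to be an open `rocksdb::DB*` and the key is a placeholder): only `read_tier` governs the index block read, while `fill_cache` and `verify_checksums` again apply to data blocks only.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::ReadOptions ro;
ro.fill_cache = false;        // scoped to data blocks, not the index block
ro.verify_checksums = false;  // likewise not applied to the index block
ro.read_tier = rocksdb::kBlockCacheTier;  // still honored for the index block

std::string value;
rocksdb::Status s = db->Get(ro, "key", &value);
if (s.IsIncomplete()) {
  // With kBlockCacheTier, a needed block (possibly the index) was not cached.
}
```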
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5481 Test Plan: make check Differential Revision: D15883082 Pulled By: ltamasi fbshipit-source-id: 9a065ec3a6db5a365cf6dd5e95190a20c5756356 --- table/block_based/block_based_table_reader.cc | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0caea508822..adc5eb6b044 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -210,8 +210,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return properties == nullptr || !properties->index_value_is_delta_encoded; } - Status GetOrReadIndexBlock(const ReadOptions& read_options, - GetContext* get_context, + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const; @@ -250,7 +249,7 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( } Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( - const ReadOptions& read_options, GetContext* get_context, + bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const { assert(index_block != nullptr); @@ -260,6 +259,11 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( return Status::OK(); } + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, get_context, lookup_context, index_block); } @@ -304,9 +308,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -366,7 +371,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { Statistics* kNullStats = nullptr; CachableEntry index_block; - Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &index_block); if (!s.ok()) { ROCKS_LOG_WARN(rep->ioptions.info_log, @@ -489,9 +494,10 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -631,9 +637,10 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = 
(read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); From 92f631da33e88ce63f1546c3a4865cc4dc1d4e13 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Tue, 18 Jun 2019 16:35:57 -0700 Subject: [PATCH 161/572] replace sprintf with its safe version snprintf (#5475) Summary: sprintf is unsafe and has buffer overrun risk. Replace it with the safer version snprintf where buffer size is supplied to avoid overrun. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5475 Differential Revision: D15879481 Pulled By: sagar0 fbshipit-source-id: 7ae1958ffc9727fa50261dfbb98ddd74e70a72d8 --- tools/trace_analyzer_tool.cc | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 627610ae0f4..9ee746af4a2 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -572,7 +572,7 @@ Status TraceAnalyzer::MakeStatistics() { // output the access count distribution if (FLAGS_output_access_count_stats && stat.second.a_count_dist_f) { for (auto& record : stat.second.a_count_stats) { - ret = sprintf(buffer_, "access_count: %" PRIu64 " num: %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "access_count: %" PRIu64 " num: %" PRIu64 "\n", record.first, record.second); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -596,7 +596,7 @@ Status TraceAnalyzer::MakeStatistics() { get_mid = true; } if (FLAGS_output_key_distribution && stat.second.a_key_size_f) { - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 "\n", record.first, + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n", record.first, record.second); if (ret < 0) { return Status::IOError("Format output failed"); @@ -624,7 +624,7 @@ Status TraceAnalyzer::MakeStatistics() { if (FLAGS_output_value_distribution && stat.second.a_value_size_f && (type == TraceOperationType::kPut || type == TraceOperationType::kMerge)) { - ret = sprintf(buffer_, + ret = snprintf(buffer_, sizeof(buffer_), "Number_of_value_size_between %" PRIu64 " and %" PRIu64 " is: %" PRIu64 "\n", v_begin, v_end, record.second); @@ -675,7 +675,7 @@ Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) { succ_ratio = (static_cast(record.second.succ_count)) / record.second.access_count; } - ret = sprintf(buffer_, "%u %zu %" PRIu64 " %" PRIu64 " %f\n", + ret = snprintf(buffer_, sizeof(buffer_), "%u %zu %" PRIu64 " %" PRIu64 " %f\n", record.second.cf_id, record.second.value_size, record.second.key_id, record.second.access_count, succ_ratio); if (ret < 0) { @@ -703,7 +703,7 @@ Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) { prefix_succ_ratio = (static_cast(prefix_succ_access)) / prefix_access; } - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n", + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n", record.second.key_id, prefix_access, prefix_count, prefix_ave_access, prefix_succ_ratio, prefix_out.c_str()); if (ret < 0) { @@ -809,7 +809,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { } if (stat.second.a_qps_f) { while (time_line < time_it.first) { - ret = sprintf(buffer_, "%u\n", 0); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", 0); if (ret < 0) { 
return Status::IOError("Format the output failed"); } @@ -821,7 +821,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { } time_line++; } - ret = sprintf(buffer_, "%u\n", time_it.second); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", time_it.second); if (ret < 0) { return Status::IOError("Format the output failed"); } @@ -870,7 +870,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { cur_ratio = (static_cast(find_time->second)) / cur_uni_key; cur_num = find_time->second; } - ret = sprintf(buffer_, "%" PRIu64 " %.12f\n", cur_num, cur_ratio); + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %.12f\n", cur_num, cur_ratio); if (ret < 0) { return Status::IOError("Format the output failed"); } @@ -887,7 +887,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { // output the prefix of top k access peak if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) { while (!stat.second.top_k_qps_sec.empty()) { - ret = sprintf(buffer_, "At time: %u with QPS: %u\n", + ret = snprintf(buffer_, sizeof(buffer_), "At time: %u with QPS: %u\n", stat.second.top_k_qps_sec.top().second, stat.second.top_k_qps_sec.top().first); if (ret < 0) { @@ -906,7 +906,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { for (auto& qps_prefix : stat.second.a_qps_prefix_stats[qps_time]) { std::string qps_prefix_out = rocksdb::LDBCommand::StringToHex(qps_prefix.first); - ret = sprintf(buffer_, "The prefix: %s Access count: %u\n", + ret = snprintf(buffer_, sizeof(buffer_), "The prefix: %s Access count: %u\n", qps_prefix_out.c_str(), qps_prefix.second); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -928,9 +928,9 @@ Status TraceAnalyzer::MakeStatisticQPS() { for (uint32_t i = 0; i < duration; i++) { for (int type = 0; type <= kTaTypeNum; type++) { if (type < kTaTypeNum) { - ret = sprintf(buffer_, "%u ", type_qps[i][type]); + ret = snprintf(buffer_, sizeof(buffer_), "%u ", type_qps[i][type]); } else { - ret = sprintf(buffer_, "%u\n", type_qps[i][type]); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", type_qps[i][type]); } if (ret < 0) { return Status::IOError("Format the output failed"); @@ -959,9 +959,9 @@ Status TraceAnalyzer::MakeStatisticQPS() { v = 0; } if (cf < cfs_size - 1) { - ret = sprintf(buffer_, "%u ", v); + ret = snprintf(buffer_, sizeof(buffer_), "%u ", v); } else { - ret = sprintf(buffer_, "%u\n", v); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", v); } if (ret < 0) { return Status::IOError("Format the output failed"); @@ -1016,7 +1016,7 @@ Status TraceAnalyzer::ReProcessing() { if (found != stat.a_key_stats.end()) { key_id = found->second.key_id; } - ret = sprintf(buffer_, "%u %" PRIu64 " %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "%u %" PRIu64 " %" PRIu64 "\n", stat.time_series.front().type, stat.time_series.front().ts, key_id); if (ret < 0) { @@ -1064,7 +1064,7 @@ Status TraceAnalyzer::ReProcessing() { TraceStats& stat = ta_[type].stats[cf_id]; if (stat.w_key_f) { if (stat.a_key_stats.find(input_key) != stat.a_key_stats.end()) { - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n", cfs_[cf_id].w_count, stat.a_key_stats[input_key].access_count); if (ret < 0) { @@ -1086,7 +1086,7 @@ Status TraceAnalyzer::ReProcessing() { prefix[type] = input_key.substr(0, FLAGS_output_prefix_cut); std::string prefix_out = rocksdb::LDBCommand::StringToHex(prefix[type]); - ret = sprintf(buffer_, "%" PRIu64 " %s\n", cfs_[cf_id].w_count, + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %s\n", 
cfs_[cf_id].w_count, prefix_out.c_str()); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -1904,7 +1904,7 @@ Status TraceAnalyzer::WriteTraceSequence(const uint32_t& type, std::string hex_key = rocksdb::LDBCommand::StringToHex(key); int ret; ret = - sprintf(buffer_, "%u %u %zu %" PRIu64 "\n", type, cf_id, value_size, ts); + snprintf(buffer_, sizeof(buffer_), "%u %u %zu %" PRIu64 "\n", type, cf_id, value_size, ts); if (ret < 0) { return Status::IOError("failed to format the output"); } From f46a2a03759a11731d62f01a0707a44ccab4cfbc Mon Sep 17 00:00:00 2001 From: Vaibhav Gogte Date: Tue, 18 Jun 2019 17:32:44 -0700 Subject: [PATCH 162/572] Export Cache::GetCharge (#5476) Summary: Exporting GetCharge to cache.hh Pull Request resolved: https://github.com/facebook/rocksdb/pull/5476 Differential Revision: D15881882 Pulled By: riversand963 fbshipit-source-id: 3d99084d10059b4fcaaaba240606ed50bc23351c --- cache/cache_test.cc | 8 ++++++++ cache/sharded_cache.h | 3 ++- include/rocksdb/cache.h | 3 +++ utilities/simulator_cache/sim_cache.cc | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 0cc3d559502..d7b191bb31f 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -686,6 +686,14 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } +TEST_P(CacheTest, GetCharge) { + Insert(1, 2); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); + ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetCharge(h1)); + cache_->Release(h1); +} + #ifdef SUPPORT_CLOCK_CACHE std::shared_ptr (*new_clock_cache_func)(size_t, int, bool) = NewClockCache; diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 920898b871f..0c1499f22dd 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -54,7 +54,8 @@ class ShardedCache : public Cache { virtual CacheShard* GetShard(int shard) = 0; virtual const CacheShard* GetShard(int shard) const = 0; virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const = 0; + virtual size_t GetCharge(Handle* handle) const override = 0; + virtual uint32_t GetHash(Handle* handle) const = 0; virtual void DisownData() override = 0; diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index ed7790aebb5..8fb691559d0 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -226,6 +226,9 @@ class Cache { // returns the memory size for the entries in use by the system virtual size_t GetPinnedUsage() const = 0; + // returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. 
diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index f6f1e671450..d84a593b9d5 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -235,6 +235,8 @@ class SimCacheImpl : public SimCache { return cache_->GetUsage(handle); } + size_t GetCharge(Handle* handle) const override { return cache_->GetCharge(handle); } + size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } void DisownData() override { From 2e8ad03ab3f9e498e682dd74600b0f8b5fc02d67 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 18 Jun 2019 18:34:39 -0700 Subject: [PATCH 163/572] Add more stats in the block cache trace analyzer (#5482) Summary: This PR adds more stats in the block cache trace analyzer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5482 Differential Revision: D15883553 Pulled By: HaoyuHuang fbshipit-source-id: 6d440e4f657af75690420102d532d0ee1ed4e9cf --- tools/block_cache_trace_analyzer.cc | 143 +++++++++++++++++++++------- tools/block_cache_trace_analyzer.h | 3 +- 2 files changed, 112 insertions(+), 34 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 3fd93a0239b..a8259de71b5 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -27,6 +27,10 @@ DEFINE_string( "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " "cache_name is lru. cache_capacity can be xK, xM or xG " "where x is a positive number."); +DEFINE_int32(block_cache_trace_downsample_ratio, 1, + "The trace collected accesses on one in every " + "block_cache_trace_downsample_ratio blocks. We scale " + "down the simulated cache size by this ratio."); DEFINE_bool(print_block_size_stats, false, "Print block size distribution and the distribution break down by " "block type and column family."); @@ -91,18 +95,30 @@ void print_break_lines(uint32_t num_break_lines) { } } +double percent(uint64_t numerator, uint64_t denomenator) { + if (denomenator == 0) { + return -1; + } + return static_cast(numerator * 100.0 / denomenator); +} + } // namespace BlockCacheTraceSimulator::BlockCacheTraceSimulator( - uint64_t warmup_seconds, + uint64_t warmup_seconds, uint32_t downsample_ratio, const std::vector& cache_configurations) : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), cache_configurations_(cache_configurations) { for (auto const& config : cache_configurations_) { for (auto cache_capacity : config.cache_capacities) { - sim_caches_.push_back( - NewSimCache(NewLRUCache(cache_capacity, config.num_shard_bits), - /*real_cache=*/nullptr, config.num_shard_bits)); + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. 
+ uint64_t simulate_cache_capacity = + cache_capacity / downsample_ratio_; + sim_caches_.push_back(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits), + /*real_cache=*/nullptr, config.num_shard_bits)); } } } @@ -285,11 +301,12 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { } } } - fprintf(stdout, "Block access count stats: \n%s", + fprintf(stdout, + "Block access count stats: The number of accesses per block.\n%s", access_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Block access count stats for block type %s: \n%s", + fprintf(stdout, "Break down by block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); } @@ -298,7 +315,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { for (auto const& bt_stats : cf_bt_stats.second) { print_break_lines(/*num_break_lines=*/1); fprintf(stdout, - "Block access count stats for column family %s and block type " + "Break down by column family %s and block type " "%s: \n%s", cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -313,6 +330,15 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { std::map cf_non_existing_keys_stats_map; HistogramStat block_access_stats; std::map cf_block_access_info; + HistogramStat percent_referenced_bytes; + std::map cf_percent_referenced_bytes; + // Total number of accesses in a data block / number of keys in a data block. + HistogramStat avg_naccesses_per_key_in_a_data_block; + std::map cf_avg_naccesses_per_key_in_a_data_block; + // The standard deviation on the number of accesses of a key in a data block. + HistogramStat stdev_naccesses_per_key_in_a_data_block; + std::map + cf_stdev_naccesses_per_key_in_a_data_block; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. 
@@ -343,6 +369,20 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { block_access_info.second.num_referenced_key_exist_in_block / (double)block_access_info.second.num_accesses) * 10000.0); + + HistogramStat hist_naccess_per_key; + for (auto const& key_access : + block_access_info.second.key_num_access_map) { + hist_naccess_per_key.Add(key_access.second); + } + uint64_t avg_accesses = hist_naccess_per_key.Average(); + uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); + avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); + cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); + stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); + cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add( + stdev_accesses); + existing_keys_stats.Add(percent_referenced_for_existing_keys); cf_existing_keys_stats_map[cf_name].Add( percent_referenced_for_existing_keys); @@ -356,7 +396,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { } } fprintf(stdout, - "Histogram on percentage of referenced keys existing in a block over " + "Histogram on the number of referenced keys existing in a block over " "the total number of keys in a block: \n%s", existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_existing_keys_stats_map) { @@ -367,7 +407,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { print_break_lines(/*num_break_lines=*/1); fprintf( stdout, - "Histogram on percentage of referenced keys DO NOT exist in a block over " + "Histogram on the number of referenced keys DO NOT exist in a block over " "the total number of keys in a block: \n%s", non_existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_non_existing_keys_stats_map) { @@ -377,7 +417,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { } print_break_lines(/*num_break_lines=*/1); fprintf(stdout, - "Histogram on percentage of accesses on keys exist in a block over " + "Histogram on the number of accesses on keys exist in a block over " "the total number of accesses in a block: \n%s", block_access_stats.ToString().c_str()); for (auto const& cf_stats : cf_block_access_info) { @@ -385,6 +425,24 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); + fprintf( + stdout, + "Histogram on the average number of accesses per key in a block: \n%s", + avg_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_avg_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Histogram on the standard deviation of the number of accesses per " + "key in a block: \n%s", + stdev_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_stdev_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } } void BlockCacheTraceAnalyzer::PrintStatsSummary() const { @@ -456,40 +514,49 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str()); fprintf(stdout, - "Number of files:%" PRIu64 "Number of blocks: %" PRIu64 
- "Number of accesses: %" PRIu64 "\n", + " Number of files:%" PRIu64 " Number of blocks: %" PRIu64 + " Number of accesses: %" PRIu64 "\n", cf_num_files, cf_num_blocks, cf_num_accesses); for (auto block_type : cf_bt_blocks) { - fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n", - block_type_to_string(block_type.first).c_str(), - block_type.second); + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, cf_num_blocks)); } for (auto caller : cf_caller_num_accesses_map) { + const uint64_t naccesses = caller.second; print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", - caller_to_string(caller.first).c_str(), caller.second); + fprintf(stdout, + "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, cf_num_accesses)); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_level : cf_caller_level_num_accesses_map[caller.first]) { fprintf(stdout, - "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 "\n", - naccess_level.first, naccess_level.second); + "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per file break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) { fprintf(stdout, - "\t File %" PRIu64 ": Number of accesses: %" PRIu64 "\n", - naccess_file.first, naccess_file.second); + "\t File %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_file.first, naccess_file.second, + percent(naccess_file.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per block type break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) { - fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n", + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", block_type_to_string(naccess_type.first).c_str(), - naccess_type.second); + naccess_type.second, percent(naccess_type.second, naccesses)); } } } @@ -500,25 +567,32 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { " Number of accesses: %" PRIu64 "\n", total_num_files, total_num_blocks, total_num_accesses); for (auto block_type : bt_num_blocks_map) { - fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n", - block_type_to_string(block_type.first).c_str(), block_type.second); + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, total_num_blocks)); } for (auto caller : caller_num_access_map) { print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", - caller_to_string(caller.first).c_str(), caller.second); + uint64_t naccesses = caller.second; + fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, total_num_accesses)); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_level : 
caller_level_num_access_map[caller.first]) { - fprintf(stdout, "\t Level %d: Number of accesses: %" PRIu64 "\n", - naccess_level.first, naccess_level.second); + fprintf(stdout, + "\t Level %d: Number of accesses: %" PRIu64 " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per block type break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_type : caller_bt_num_access_map[caller.first]) { - fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n", + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", block_type_to_string(naccess_type.first).c_str(), - naccess_type.second); + naccess_type.second, percent(naccess_type.second, naccesses)); } } } @@ -575,12 +649,15 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { } uint64_t warmup_seconds = FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0; + uint32_t downsample_ratio = FLAGS_block_cache_trace_downsample_ratio > 0 + ? FLAGS_block_cache_trace_downsample_ratio + : 0; std::vector cache_configs = parse_cache_config_file(FLAGS_block_cache_sim_config_path); std::unique_ptr cache_simulator; if (!cache_configs.empty()) { - cache_simulator.reset( - new BlockCacheTraceSimulator(warmup_seconds, cache_configs)); + cache_simulator.reset(new BlockCacheTraceSimulator( + warmup_seconds, downsample_ratio, cache_configs)); } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, FLAGS_output_miss_ratio_curve_path, diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 1420906f3cf..0690d14d0f3 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -31,7 +31,7 @@ class BlockCacheTraceSimulator { // warmup_seconds: The number of seconds to warmup simulated caches. The // hit/miss counters are reset after the warmup completes. BlockCacheTraceSimulator( - uint64_t warmup_seconds, + uint64_t warmup_seconds, uint32_t downsample_ratio, const std::vector& cache_configurations); ~BlockCacheTraceSimulator() = default; // No copy and move. @@ -52,6 +52,7 @@ class BlockCacheTraceSimulator { private: const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; const std::vector cache_configurations_; bool warmup_complete_ = false; From 5355e527d9a4704f2057962f243318772896b4aa Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 18 Jun 2019 19:00:03 -0700 Subject: [PATCH 164/572] Make the 'block read count' performance counters consistent (#5484) Summary: The patch brings the semantics of per-block-type read performance context counters in sync with the generic block_read_count by only incrementing the counter if the block was actually read from the file. It also fixes index_block_read_count, which fell victim to the refactoring in PR https://github.com/facebook/rocksdb/issues/5298. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5484 Test Plan: Extended the unit tests. 
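As an additional illustration, a hedged sketch of how these counters are typically observed (assuming `db` is an open `rocksdb::DB*`); with this patch, each per-type counter increments only when its block is actually read from the file:

```cpp
#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
rocksdb::get_perf_context()->Reset();

std::string value;
db->Get(rocksdb::ReadOptions(), "key", &value);

const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
// A block cache hit on the index now leaves index_block_read_count at 0,
// consistent with block_read_count.
std::fprintf(stdout, "block=%llu index=%llu filter=%llu\n",
             static_cast<unsigned long long>(ctx->block_read_count),
             static_cast<unsigned long long>(ctx->index_block_read_count),
             static_cast<unsigned long long>(ctx->filter_block_read_count));
```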
Differential Revision: D15887431 Pulled By: ltamasi fbshipit-source-id: a3889759d0ac5759d56625d692cd828d1b9207a6 --- HISTORY.md | 1 + table/block_based/block_based_table_reader.cc | 68 ++++++++++++++----- table/block_based/block_based_table_reader.h | 2 + table/block_based/block_type.h | 6 ++ table/block_fetcher.cc | 20 ++++++ table/block_fetcher.h | 5 +- table/meta_blocks.cc | 15 ++-- table/meta_blocks.h | 3 +- table/plain/plain_table_reader.cc | 5 +- table/table_test.cc | 22 ++++-- 10 files changed, 113 insertions(+), 34 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0b6409dbe47..18feefafce8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index adc5eb6b044..66fe34b95ea 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -80,14 +80,14 @@ Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { BlockContents contents; BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, &contents, ioptions, do_uncompress, - maybe_compressed, uncompression_dict, + maybe_compressed, block_type, uncompression_dict, cache_options, memory_allocator); Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { @@ -603,8 +603,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { BlockFetcher prefixes_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -613,8 +613,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { BlockFetcher prefixes_meta_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + true 
/*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = prefixes_meta_block_fetcher.ReadBlockContents(); if (!s.ok()) { // TODO: log error @@ -1373,7 +1373,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( rep_->file.get(), prefetch_buffer, rep_->footer, read_options, rep_->compression_dict_handle, compression_dict_cont.get(), rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options); + BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(), + cache_options); s = compression_block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -1583,7 +1584,7 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, Status s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), rep_->footer.metaindex_handle(), &meta, rep_->ioptions, - true /* decompress */, true /*maybe_compressed*/, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options)); @@ -1818,8 +1819,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter( BlockFetcher block_fetcher( rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); + false /*maybe_compressed*/, BlockType::kFilter, + UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, + GetMemoryAllocator(rep->table_options)); Status s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -1940,7 +1942,6 @@ CachableEntry BlockBasedTable::GetFilter( ? Cache::Priority::HIGH : Cache::Priority::LOW); if (s.ok()) { - PERF_COUNTER_ADD(filter_block_read_count, 1); UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); } else { RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); @@ -2021,7 +2022,6 @@ CachableEntry BlockBasedTable::GetUncompressionDict( : Cache::Priority::LOW); if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, get_context, usage); dict = uncompression_dict.release(); @@ -2217,7 +2217,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, - uncompression_dict, rep_->persistent_cache_options, + block_type, uncompression_dict, rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); @@ -2335,7 +2335,7 @@ Status BlockBasedTable::RetrieveBlock( s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, rep_->blocks_maybe_compressed, - rep_->blocks_maybe_compressed, uncompression_dict, + rep_->blocks_maybe_compressed, block_type, uncompression_dict, rep_->persistent_cache_options, rep_->get_global_seqno(block_type), block_type == BlockType::kData ? 
rep_->table_options.read_amp_bytes_per_bit @@ -3335,7 +3335,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks( BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, + false /* decompress */, false /*maybe_compressed*/, BlockType::kData, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -3345,6 +3345,38 @@ Status BlockBasedTable::VerifyChecksumInBlocks( return s; } +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFilterBlockPrefix) || + meta_block_name.starts_with(kFullFilterBlockPrefix) || + meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name == kPropertiesBlock) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlock) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlock) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + assert(false); + return BlockType::kInvalid; +} + Status BlockBasedTable::VerifyChecksumInMetaBlocks( InternalIteratorBase* index_iter) { Status s; @@ -3357,13 +3389,15 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( Slice input = index_iter->value(); s = handle.DecodeFrom(&input); BlockContents contents; + const Slice meta_block_name = index_iter->key(); BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, ReadOptions(), handle, &contents, rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { + if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { TableProperties* table_properties; s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, index_iter->value(), @@ -3662,7 +3696,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, ReadOptions(), handle, &block, rep_->ioptions, false /*decompress*/, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), + BlockType::kFilter, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 17c4e7238c8..3c92621bdcd 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -407,6 +407,8 @@ class BlockBasedTable : public TableReader { const BlockBasedTableOptions& table_options, const int level, BlockCacheLookupContext* lookup_context); + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); + Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h index 9b9c53946c9..a60be2e6a70 100644 --- a/table/block_based/block_type.h +++ 
b/table/block_based/block_type.h @@ -5,6 +5,8 @@ #pragma once +#include + namespace rocksdb { // Represents the types of blocks used in the block based table format. @@ -17,8 +19,12 @@ enum class BlockType : uint8_t { kProperties, kCompressionDictionary, kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, kMetaIndex, kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid }; } // namespace rocksdb diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index afcbbaee4f5..35beb79502b 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -220,6 +220,26 @@ Status BlockFetcher::ReadBlockContents() { &slice_, used_buf_); } PERF_COUNTER_ADD(block_read_count, 1); + + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; + + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; + + // Nothing to do here as we don't have counters for the other types. + default: + break; + } + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); if (!status_.ok()) { return status_; diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 6451d6d2acc..06e5d9dfa31 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -10,6 +10,7 @@ #pragma once #include "memory/memory_allocator.h" #include "table/block_based/block.h" +#include "table/block_based/block_type.h" #include "table/format.h" namespace rocksdb { @@ -39,7 +40,7 @@ class BlockFetcher { FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, @@ -53,6 +54,7 @@ class BlockFetcher { ioptions_(ioptions), do_uncompress_(do_uncompress), maybe_compressed_(maybe_compressed), + block_type_(block_type), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), @@ -72,6 +74,7 @@ class BlockFetcher { const ImmutableCFOptions& ioptions_; bool do_uncompress_; bool maybe_compressed_; + BlockType block_type_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 341a1185579..7bbbc7966de 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -216,7 +216,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, BlockFetcher block_fetcher( file, prefetch_buffer, footer, read_options, handle, &block_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); // property block is never compressed. Need to add uncompress logic if we are // to compress it.. 
@@ -375,8 +376,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -446,7 +447,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -467,7 +469,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { Status status; @@ -488,6 +490,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); @@ -515,7 +518,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, // Reading metablock BlockFetcher block_fetcher2( file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, + ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); return block_fetcher2.ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 5224c54714d..86c703f953c 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -16,6 +16,7 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_based/block_builder.h" +#include "table/block_based/block_type.h" #include "table/format.h" #include "util/kv_map.h" @@ -143,7 +144,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 15f7be1c253..2f8f300d871 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -299,7 +299,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents, + BlockType::kIndex, &index_block_contents, true 
/* compression_type_missing */); bool index_in_file = s.ok(); @@ -310,7 +310,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents, true /* compression_type_missing */); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } diff --git a/table/table_test.cc b/table/table_test.cc index c59c9d8c33f..e836f89a8df 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2268,6 +2268,8 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { if (index_and_filter_in_cache) { // data, index and filter block ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // just the data block ASSERT_EQ(get_perf_context()->block_read_count, 1); @@ -2293,9 +2295,12 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { if (bloom_filter_type == 0) { // with block-based, we read index and then the filter ASSERT_EQ(get_perf_context()->block_read_count, 2); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // with full-filter, we read filter first and then we stop ASSERT_EQ(get_perf_context()->block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } } else { // filter is already in memory and it figures out that the key doesn't @@ -3565,7 +3570,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); - auto BlockFetchHelper = [&](const BlockHandle& handle, + auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { ReadOptions read_options; read_options.verify_checksums = false; @@ -3574,8 +3579,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, handle, contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options); + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -3584,7 +3589,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - BlockFetchHelper(metaindex_handle, &metaindex_contents); + BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); @@ -3601,7 +3607,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(properties_handle.DecodeFrom(&v)); BlockContents properties_contents; - BlockFetchHelper(properties_handle, &properties_contents); + BlockFetchHelper(properties_handle, BlockType::kProperties, + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); @@ -3660,8 +3667,9 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, 
ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - pcache_opts, nullptr /*memory_allocator*/); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); From fe90ed7a70b6cda47ef970375e74b9a0b486cab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Wed, 19 Jun 2019 08:02:21 -0700 Subject: [PATCH 165/572] Replace Corruption with TryAgain status when new tail is not visible to TransactionLogIterator (#5474) Summary: When tailing the WAL with TransactionLogIterator, it used to return Corruption status to indicate that the WAL has new tail that is not visible to the iterator, which is a misleading status. The patch replaces it with TryAgain which is more descriptive of a status, indicating that the user needs to create a new iterator to fetch the recent tail. Fixes https://github.com/facebook/rocksdb/issues/5455 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5474 Differential Revision: D15898953 Pulled By: maysamyabandeh fbshipit-source-id: 40966f6457cb539e1aeb104daeada6b0e46059fc --- HISTORY.md | 1 + db/transaction_log_impl.cc | 3 ++- db/wal_manager_test.cc | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 18feefafce8..825c1def47c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -29,6 +29,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
 ## 6.2.0 (4/30/2019)

diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc
index 2e4475bb6ac..8c526af12ae 100644
--- a/db/transaction_log_impl.cc
+++ b/db/transaction_log_impl.cc
@@ -199,7 +199,8 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) {
     if (current_last_seq_ == versions_->LastSequence()) {
       current_status_ = Status::OK();
     } else {
-      current_status_ = Status::Corruption("NO MORE DATA LEFT");
+      const char* msg = "Create a new iterator to fetch the new tail.";
+      current_status_ = Status::TryAgain(msg);
     }
     return;
   }
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index 1bc6a8afe83..671dc84e1b8 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -293,6 +293,29 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
   // Check that an empty iterator is returned
   ASSERT_TRUE(!iter->Valid());
 }
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+  Init();
+  CreateArchiveLogs(2, 100);
+  auto iter = OpenTransactionLogIter(0);
+  CreateArchiveLogs(1, 100);
+  int i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 200);
+  // A new log file was added after the iterator was created.
+  // TryAgain indicates a new iterator is needed to fetch the new data
+  ASSERT_TRUE(iter->status().IsTryAgain());
+
+  iter = OpenTransactionLogIter(0);
+  i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 300);
+  ASSERT_TRUE(iter->status().ok());
+}
 } // namespace rocksdb

From 24b118ad986e656a11b94ad441cd455830bac7b2 Mon Sep 17 00:00:00 2001
From: Vijay Nadimpalli
Date: Wed, 19 Jun 2019 14:07:36 -0700
Subject: [PATCH 166/572] Combine the read-ahead logic for user reads and
 compaction reads (#5431)

Summary:
Currently the read-ahead logic for user reads and compaction reads goes
through different code paths where compaction reads create new table
readers and use `ReadaheadRandomAccessFile`. This change unifies the
read-ahead logic so that both paths use the read-ahead in
BlockBasedTableReader::InitDataBlock(). As a result of the change, the
`ReadaheadRandomAccessFile` class and the
`new_table_reader_for_compaction_inputs` option will no longer be used.
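A hedged configuration sketch (the path and size are placeholders, and the continued role of `compaction_readahead_size` as the compaction read-ahead knob is an assumption): after this change, compaction read-ahead is handled inside the block-based table reader, so setting `new_table_reader_for_compaction_inputs` no longer changes behavior.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_readahead_size = 2 * 1024 * 1024;  // 2MB read-ahead
  // options.new_table_reader_for_compaction_inputs = true;  // no longer used

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/readahead_example", &db);
  delete db;
  return s.ok() ? 0 : 1;
}
```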
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5431 Test Plan: make check Here is the benchmarking - https://gist.github.com/vjnadimpalli/083cf423f7b6aa12dcdb14c858bc18a5 Differential Revision: D15772533 Pulled By: vjnadimpalli fbshipit-source-id: b71dca710590471ede6fb37553388654e2e479b9 --- db/db_compaction_test.cc | 21 ++--- db/table_cache.cc | 76 ++++--------------- db/table_cache.h | 3 +- include/rocksdb/options.h | 2 + table/block_based/block_based_table_reader.cc | 39 ++++++---- table/block_based/block_based_table_reader.h | 19 ++++- table/block_fetcher.cc | 5 +- table/block_fetcher.h | 8 +- table/cuckoo/cuckoo_table_reader.cc | 3 +- table/cuckoo/cuckoo_table_reader.h | 6 +- table/meta_blocks.cc | 11 ++- table/mock_table.cc | 3 +- table/mock_table.h | 3 +- table/plain/plain_table_reader.cc | 3 +- table/plain/plain_table_reader.h | 6 +- table/table_reader.h | 11 +-- table/table_test.cc | 4 +- util/file_reader_writer.cc | 16 ++-- util/file_reader_writer.h | 11 ++- 19 files changed, 124 insertions(+), 126 deletions(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 6537950fcc7..7f639c85397 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -497,14 +497,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // Create new iterator for: // (1) 1 for verifying flush results - // (2) 3 for compaction input files - // (3) 1 for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 5); + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); ASSERT_EQ(num_new_table_reader, 0); num_table_cache_lookup = 0; @@ -519,13 +519,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // May preload table cache too. ASSERT_GE(num_table_cache_lookup, 1); old_num_table_cache_lookup2 = num_table_cache_lookup; - // One for compaction input, one for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 2); + // One for verifying compaction results. + // No new iterator created for compaction. 
+ ASSERT_EQ(num_new_table_reader, 1); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); ASSERT_EQ(num_new_table_reader, 0); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -4339,12 +4340,6 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { options.env = new MockEnv(Env::Default()); Reopen(options); bool readahead = false; - SyncPoint::GetInstance()->SetCallBack( - "TableCache::NewIterator:for_compaction", [&](void* arg) { - bool* use_direct_reads = static_cast(arg); - ASSERT_EQ(*use_direct_reads, - options.use_direct_reads); - }); SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { bool* use_direct_writes = static_cast(arg); diff --git a/db/table_cache.cc b/db/table_cache.cc index 0a152f89a16..bbfaf32e09e 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -16,6 +16,7 @@ #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" +#include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" @@ -43,13 +44,6 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } -static void DeleteTableReader(void* arg1, void* arg2) { - TableReader* table_reader = reinterpret_cast(arg1); - Statistics* stats = reinterpret_cast(arg2); - RecordTick(stats, NO_FILE_CLOSES); - delete table_reader; -} - static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); @@ -96,8 +90,8 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - bool sequential_mode, size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, bool for_compaction) { std::string fname = @@ -107,13 +101,6 @@ Status TableCache::GetTableReader( RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (readahead > 0 && !env_options.use_mmap_reads) { - // Not compatible with mmap files since ReadaheadRandomAccessFile requires - // its wrapped file's Read() to copy data into the provided scratch - // buffer, which mmap files don't use. - // TODO(ajkr): try madvise for mmap files in place of buffered readahead. 
- file = NewReadaheadRandomAccessFile(std::move(file), readahead); - } if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } @@ -164,10 +151,9 @@ Status TableCache::FindTable(const EnvOptions& env_options, } std::unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, - false /* sequential mode */, 0 /* readahead */, - record_read_stats, file_read_hist, &table_reader, - prefix_extractor, skip_filters, level, - prefetch_index_and_filter_in_cache); + false /* sequential mode */, record_read_stats, + file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); @@ -196,48 +182,21 @@ InternalIterator* TableCache::NewIterator( PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; - bool create_new_table_reader = false; TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - size_t readahead = 0; - if (for_compaction) { -#ifndef NDEBUG - bool use_direct_reads_for_compaction = env_options.use_direct_reads; - TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", - &use_direct_reads_for_compaction); -#endif // !NDEBUG - if (ioptions_.new_table_reader_for_compaction_inputs) { - // get compaction_readahead_size from env_options allows us to set the - // value dynamically - readahead = env_options.compaction_readahead_size; - create_new_table_reader = true; - } - } auto& fd = file_meta.fd; - if (create_new_table_reader) { - std::unique_ptr table_reader_unique_ptr; - s = GetTableReader( - env_options, icomparator, fd, true /* sequential_mode */, readahead, - !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, - prefix_extractor, false /* skip_filters */, level, - true /* prefetch_index_and_filter_in_cache */, for_compaction); + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record read_stats */, file_read_hist, + skip_filters, level); if (s.ok()) { - table_reader = table_reader_unique_ptr.release(); - } - } else { - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, - skip_filters, level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } + table_reader = GetTableReaderFromHandle(handle); } } InternalIterator* result = nullptr; @@ -247,13 +206,10 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction); + skip_filters, for_compaction, + env_options.compaction_readahead_size); } - if (create_new_table_reader) { - assert(handle == nullptr); - result->RegisterCleanup(&DeleteTableReader, table_reader, - ioptions_.statistics); - } else if (handle != nullptr) { + if (handle != nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); handle = nullptr; // prevent from releasing below } diff --git a/db/table_cache.h b/db/table_cache.h index 1577cef82ff..dbf76039a23 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -177,8 +177,7 @@ class TableCache { Status 
GetTableReader(const EnvOptions& env_options,
                 const InternalKeyComparator& internal_comparator,
                 const FileDescriptor& fd, bool sequential_mode,
-                size_t readahead, bool record_read_stats,
-                HistogramImpl* file_read_hist,
+                bool record_read_stats, HistogramImpl* file_read_hist,
                 std::unique_ptr* table_reader,
                 const SliceTransform* prefix_extractor = nullptr,
                 bool skip_filters = false, int level = -1,

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index fe5617fb5c3..8ebcd292dba 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -760,6 +760,8 @@ struct DBOptions {
   // for this mode if using block-based table.
   //
   // Default: false
+  // This flag has no effect on the behavior of compaction and is planned to
+  // be removed in the future.
   bool new_table_reader_for_compaction_inputs = false;

   // If non-zero, we perform bigger reads when doing compaction. If you're

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 66fe34b95ea..9339c35364f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -83,12 +83,13 @@ Status ReadBlockFromFile(
     bool do_uncompress, bool maybe_compressed, BlockType block_type,
     const UncompressionDict& uncompression_dict,
     const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
-    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
+    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
+    bool for_compaction = false) {
   BlockContents contents;
-  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
-                             &contents, ioptions, do_uncompress,
-                             maybe_compressed, block_type, uncompression_dict,
-                             cache_options, memory_allocator);
+  BlockFetcher block_fetcher(
+      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
+      do_uncompress, maybe_compressed, block_type, uncompression_dict,
+      cache_options, memory_allocator, nullptr, for_compaction);
   Status s = block_fetcher.ReadBlockContents();
   if (s.ok()) {
     result->reset(new Block(std::move(contents), global_seqno,
@@ -1906,7 +1907,7 @@ CachableEntry BlockBasedTable::GetFilter(
   if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
     return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
-        /*cache_handle=*/nullptr, /*own_value=*/false};
+            /*cache_handle=*/nullptr, /*own_value=*/false};
   }
   PERF_TIMER_GUARD(read_filter_block_nanos);
@@ -2075,7 +2076,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
     const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
     BlockType block_type, bool key_includes_seq, bool index_key_is_full,
     GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
-    FilePrefetchBuffer* prefetch_buffer) const {
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
   PERF_TIMER_GUARD(new_table_block_iter_nanos);

   TBlockIter* iter = input_iter != nullptr ?
input_iter : new TBlockIter; @@ -2094,7 +2095,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - block_type, get_context, lookup_context); + block_type, get_context, lookup_context, for_compaction); if (!s.ok()) { assert(block.IsEmpty()); @@ -2144,6 +2145,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( s = block_cache->Insert(unique_key, nullptr, block.GetValue()->ApproximateMemoryUsage(), nullptr, &cache_handle); + if (s.ok()) { assert(cache_handle != nullptr); iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, @@ -2297,7 +2299,8 @@ Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const { assert(block_entry); assert(block_entry->IsEmpty()); @@ -2340,7 +2343,7 @@ Status BlockBasedTable::RetrieveBlock( block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep_->table_options)); + GetMemoryAllocator(rep_->table_options), for_compaction); } if (!s.ok()) { @@ -2714,13 +2717,18 @@ void BlockBasedTableIterator::InitDataBlock() { rep->file.get(), read_options_.readahead_size, read_options_.readahead_size)); } + } else if (!prefetch_buffer_) { + prefetch_buffer_.reset( + new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_, + compaction_readahead_size_)); } Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, - /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + for_compaction_); block_iter_points_to_real_block_ = true; } } @@ -2806,7 +2814,8 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction) { + Arena* arena, bool skip_filters, bool for_compaction, + size_t compaction_readahead_size) { BlockCacheLookupContext lookup_context{ for_compaction ? 
BlockCacheLookupCaller::kCompaction : BlockCacheLookupCaller::kUserIterator}; @@ -2823,7 +2832,8 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2835,7 +2845,8 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + compaction_readahead_size); } } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3c92621bdcd..be758c96798 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -123,6 +123,8 @@ class BlockBasedTable : public TableReader { // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). // @param skip_filters Disables loading/accessing the filter block + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator( const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, @@ -131,7 +133,8 @@ class BlockBasedTable : public TableReader { // i.e., it will populate the block cache with blocks in the new SST // files. We treat those as a user is calling iterator for now. We should // differentiate the callers. - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -234,7 +237,7 @@ class BlockBasedTable : public TableReader { TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, - FilePrefetchBuffer* prefetch_buffer) const; + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; class PartitionedIndexIteratorState; @@ -283,7 +286,8 @@ class BlockBasedTable : public TableReader { const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; + BlockCacheLookupContext* lookup_context, + bool for_compaction = false) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -596,6 +600,8 @@ struct BlockBasedTable::Rep { // Iterates over the contents of BlockBasedTable. 
template class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true public: BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, @@ -605,7 +611,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { const SliceTransform* prefix_extractor, BlockType block_type, bool key_includes_seq = true, bool index_key_is_full = true, - bool for_compaction = false) + bool for_compaction = false, + size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), read_options_(read_options), @@ -621,6 +628,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), for_compaction_(for_compaction), + compaction_readahead_size_(compaction_readahead_size), lookup_context_(for_compaction ? BlockCacheLookupCaller::kCompaction : BlockCacheLookupCaller::kUserIterator) {} @@ -734,6 +742,9 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; + // Readahead size used in compaction, its value is used only if + // for_compaction_ = true + size_t compaction_readahead_size_; BlockHandle prev_index_value_; BlockCacheLookupContext lookup_context_; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 35beb79502b..6fdddc37e49 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -93,7 +93,8 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr && prefetch_buffer_->TryReadFromCache( handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_)) { + static_cast(handle_.size()) + kBlockTrailerSize, &slice_, + for_compaction_)) { block_size_ = static_cast(handle_.size()); CheckBlockChecksum(); if (!status_.ok()) { @@ -217,7 +218,7 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_read_time); // Actual file read status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_); + &slice_, used_buf_, for_compaction_); } PERF_COUNTER_ADD(block_read_count, 1); diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 06e5d9dfa31..f67c974becb 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -44,7 +44,8 @@ class BlockFetcher { const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, - MemoryAllocator* memory_allocator_compressed = nullptr) + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) : file_(file), prefetch_buffer_(prefetch_buffer), footer_(footer), @@ -58,7 +59,9 @@ class BlockFetcher { uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), - memory_allocator_compressed_(memory_allocator_compressed) {} + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) {} + Status ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } @@ -88,6 +91,7 @@ class BlockFetcher { char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; rocksdb::CompressionType compression_type_; + bool for_compaction_ = false; // return true if found bool TryGetUncompressBlockFromPersistentCache(); diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 
905528e9bbf..821743608e4 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -377,7 +377,8 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/) { + bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 0080a76e158..cdb0302bd3d 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -45,11 +45,15 @@ class CuckooTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 7bbbc7966de..4205d298b6d 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -487,12 +487,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file, read_options.verify_checksums = false; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */, false /*maybe_compressed*/, - BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, metaindex_handle, + &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..4d55bf7c9a8 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -34,7 +34,8 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } diff --git a/table/mock_table.h b/table/mock_table.h index 005de1c3dc2..6a5b5ab31cd 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -44,7 +44,8 @@ class MockTableReader : public TableReader { const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, diff --git a/table/plain/plain_table_reader.cc 
b/table/plain/plain_table_reader.cc index 2f8f300d871..2f036e61ae1 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -196,7 +196,8 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 774e2eb36ef..7a468bdb8c8 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -77,11 +77,15 @@ class PlainTableReader: public TableReader { bool full_scan_mode, const bool immortal_table = false, const SliceTransform* prefix_extractor = nullptr); + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; diff --git a/table/table_reader.h b/table/table_reader.h index bf3289818d6..2904526e59b 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -44,11 +44,12 @@ class TableReader { // all the states but those allocated in arena. // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. 
- virtual InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) = 0; + // compaction_readahead_size: its value will only be used if for_compaction = + // true + virtual InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, + Arena* arena = nullptr, bool skip_filters = false, + bool for_compaction = false, size_t compaction_readahead_size = 0) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { diff --git a/table/table_test.cc b/table/table_test.cc index e836f89a8df..8e290368428 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -3590,7 +3590,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockContents metaindex_contents; BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, - &metaindex_contents); + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); @@ -3608,7 +3608,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockContents properties_contents; BlockFetchHelper(properties_handle, BlockType::kProperties, - &properties_contents); + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 2c4e0a39f67..0af4c2098f1 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -70,7 +70,7 @@ Status SequentialFileReader::Skip(uint64_t n) { } Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { + char* scratch, bool for_compaction) const { Status s; uint64_t elapsed = 0; { @@ -90,7 +90,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, buf.AllocateNewBuffer(read_size); while (buf.CurrentSize() < read_size) { size_t allowed; - if (for_compaction_ && rate_limiter_ != nullptr) { + if (for_compaction && rate_limiter_ != nullptr) { allowed = rate_limiter_->RequestToken( buf.Capacity() - buf.CurrentSize(), buf.Alignment(), Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); @@ -134,7 +134,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, const char* res_scratch = nullptr; while (pos < n) { size_t allowed; - if (for_compaction_ && rate_limiter_ != nullptr) { + if (for_compaction && rate_limiter_ != nullptr) { if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { sw.DelayStart(); } @@ -711,7 +711,8 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { } // namespace Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, - uint64_t offset, size_t n) { + uint64_t offset, size_t n, + bool for_compaction) { size_t alignment = reader->file()->GetRequiredBufferAlignment(); size_t offset_ = static_cast(offset); uint64_t rounddown_offset = Rounddown(offset_, alignment); @@ -771,7 +772,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, Slice result; s = reader->Read(rounddown_offset + chunk_len, static_cast(roundup_len - chunk_len), &result, - buffer_.BufferStart() + chunk_len); + buffer_.BufferStart() + chunk_len, for_compaction); if (s.ok()) { buffer_offset_ = rounddown_offset; buffer_.Size(static_cast(chunk_len) + result.size()); @@ -780,7 +781,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, } bool 
FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
-                                          Slice* result) {
+                                          Slice* result, bool for_compaction) {
   if (track_min_offset_ && offset < min_offset_read_) {
     min_offset_read_ = static_cast(offset);
   }
@@ -797,7 +798,8 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
     assert(file_reader_ != nullptr);
     assert(max_readahead_size_ >= readahead_size_);
-    Status s = Prefetch(file_reader_, offset, n + readahead_size_);
+    Status s =
+        Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
     if (!s.ok()) {
       return false;
     }

diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h
index 5ec332fc7a1..01df1067ed9 100644
--- a/util/file_reader_writer.h
+++ b/util/file_reader_writer.h
@@ -158,7 +158,8 @@ class RandomAccessFileReader {
   RandomAccessFileReader(const RandomAccessFileReader&) = delete;
   RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;

-  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const;
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
+              bool for_compaction = false) const;

   Status Prefetch(uint64_t offset, size_t n) const {
     return file_->Prefetch(offset, n);
@@ -343,7 +344,9 @@ class FilePrefetchBuffer {
   // reader : the file reader.
   // offset : the file offset to start reading from.
   // n : the number of bytes to read.
-  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
+  // for_compaction : if prefetch is done for compaction read.
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
+                  bool for_compaction = false);

   // Tries returning the data for a file read from this buffer, if that data is
   // in the buffer.
   // Otherwise returns false.
   //
   // If the return value is true, "result" points to the data in the buffer.
   // offset : the file offset.
   // n : the number of bytes.
   // result : output buffer to put the data into.
-  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
+  // for_compaction : if cache read is done for compaction read.
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
+                        bool for_compaction = false);

   // The minimum `offset` ever passed to TryReadFromCache(). This will only be
   // tracked if track_min_offset = true.

From 5830c619d5732017a542bbef1be69f3e92dcd5f1 Mon Sep 17 00:00:00 2001
From: Jurriaan Mous
Date: Wed, 19 Jun 2019 14:39:19 -0700
Subject: [PATCH 167/572] Java: Make the generics of the Options interfaces
 more strict (#5461)

Summary:
Make the generics of the Options interfaces more strict so they are usable in a Kotlin Multiplatform expect/actual typealias implementation without causing a Violation of Finite Bound Restriction.

This fix would enable the creation of a generic Kotlin multiplatform library by just typealiasing the JVM implementation to the current Java implementation.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5461

Differential Revision: D15903288

Pulled By: sagar0

fbshipit-source-id: 75e83fdf5d2fcede40744a17e767563d6a4b0696
---
 .../java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java | 2 +-
 .../rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java | 2 +-
 .../src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/DBOptionsInterface.java | 2 +-
 .../java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/Options.java | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
index ac8550f3ef7..532db473407 100644
--- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
@@ -14,7 +14,7 @@
 * Taken from include/rocksdb/advanced_options.h
 */
 public interface AdvancedColumnFamilyOptionsInterface
-    <T extends AdvancedColumnFamilyOptionsInterface> {
+    <T extends AdvancedColumnFamilyOptionsInterface<T>> {

 /**
  * The minimum number of write buffers that will be merged together

diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
index 3ec46712389..64a6f9dccc7 100644
--- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
@@ -12,7 +12,7 @@
 * and MutableCFOptions in util/cf_options.h
 */
 public interface AdvancedMutableColumnFamilyOptionsInterface
-    <T extends AdvancedMutableColumnFamilyOptionsInterface> {
+    <T extends AdvancedMutableColumnFamilyOptionsInterface<T>> {

 /**
  * The maximum number of write buffers that are built up in memory.
diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
index f88a21af2b0..3c8cd5d5182 100644
--- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
@@ -6,7 +6,7 @@
 package org.rocksdb;

 public interface ColumnFamilyOptionsInterface
-    <T extends ColumnFamilyOptionsInterface>
+    <T extends ColumnFamilyOptionsInterface<T>>
     extends AdvancedColumnFamilyOptionsInterface<T> {

 /**

diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
index af9aa179bf4..611f4f5da71 100644
--- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -8,7 +8,7 @@
 import java.util.Collection;
 import java.util.List;

-public interface DBOptionsInterface<T extends DBOptionsInterface> {
+public interface DBOptionsInterface<T extends DBOptionsInterface<T>> {

 /**
  * Use this if your DB is very small (like under 1GB) and you don't want to

diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
index c2efcc54b6b..4f4749646f8 100644
--- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
@@ -6,7 +6,7 @@
 package org.rocksdb;

 public interface MutableColumnFamilyOptionsInterface
-    <T extends MutableColumnFamilyOptionsInterface>
+    <T extends MutableColumnFamilyOptionsInterface<T>>
     extends AdvancedMutableColumnFamilyOptionsInterface<T> {

 /**

diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
index 1715d69d093..00087a43cae 100644
--- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
@@ -1,7 +1,7 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 package org.rocksdb;

-public interface MutableDBOptionsInterface<T extends MutableDBOptionsInterface> {
+public interface MutableDBOptionsInterface<T extends MutableDBOptionsInterface<T>> {

 /**
  * Specifies the maximum number of concurrent background jobs (both flushes

diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index 5831b1e298e..bb3c87aefd5 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -16,7 +16,7 @@
 * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
 *
 * If {@link #dispose()} function is not called, then it will be GC'd
- * automaticallyand native resources will be released as part of the process.
+ * automatically and native resources will be released as part of the process.
 */
 public class Options extends RocksObject
     implements DBOptionsInterface<Options>,

From 68614a9608f5d70a247cdcc4621a150141cfe72f Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Wed, 19 Jun 2019 16:42:59 -0700
Subject: [PATCH 168/572] Fix AlignedBuffer's usage in Encryption Env (#5396)

Summary:
The usage of `AlignedBuffer` in env_encryption.cc writes and reads to/from the AlignedBuffer's internal buffer directly without going through AlignedBuffer's APIs (like `Append` and `Read`), causing encapsulation to break in some cases. The writes are especially problematic, as after the data is written to the buffer (directly using either memmove or memcpy), the size of the buffer is not updated ... causing the AlignedBuffer to lose track of the encapsulated buffer's current size.
Fixed this by updating the buffer size after every write.
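A minimal sketch of the invariant this fix restores, using a toy buffer rather than the real `rocksdb::AlignedBuffer`: every raw write into `BufferStart()` must be followed by an explicit `Size()` update so that `CurrentSize()` reflects the bytes actually present.

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Toy stand-in for AlignedBuffer (illustrative only).
struct ToyAlignedBuffer {
  std::vector<char> data_;
  size_t size_ = 0;
  void AllocateNewBuffer(size_t n) { data_.resize(n); size_ = 0; }
  char* BufferStart() { return data_.data(); }
  void Size(size_t n) { size_ = n; }           // record the logical size
  size_t CurrentSize() const { return size_; } // bytes actually present
};

int main() {
  const char payload[] = "secret";
  ToyAlignedBuffer buf;
  buf.AllocateNewBuffer(sizeof(payload));
  // The buggy pattern wrote via memmove and kept using data.size(), while
  // CurrentSize() silently stayed 0; the fix records the size explicitly
  // and then uses CurrentSize() for the encryption and Slice length.
  std::memmove(buf.BufferStart(), payload, sizeof(payload));
  buf.Size(sizeof(payload));
  assert(buf.CurrentSize() == sizeof(payload));
  return 0;
}
```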
Todo for later: Add an overloaded method to AlignedBuffer to support a memmove in addition to a memcopy. Encryption env does a memmove, and hence I couldn't switch to using `AlignedBuffer.Append()`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5396 Test Plan: `make check` Differential Revision: D15764756 Pulled By: sagar0 fbshipit-source-id: 2e24b52bd3b4b5056c5c1da157f91ddf89370183 --- env/env_encryption.cc | 87 ++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/env/env_encryption.cc b/env/env_encryption.cc index df1b0011a01..6be2137ed6e 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -195,23 +195,26 @@ class EncryptedWritableFile : public WritableFileWrapper { EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - Status Append(const Slice& data) override { + Status Append(const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); if (data.size() > 0) { auto offset = file_->GetFileSize(); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); + status = file_->Append(dataToAppend); if (!status.ok()) { return status; } @@ -221,18 +224,19 @@ class EncryptedWritableFile : public WritableFileWrapper { Status PositionedAppend(const Slice& data, uint64_t offset) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->PositionedAppend(dataToAppend, offset); if (!status.ok()) { @@ -325,18 +329,19 @@ class EncryptedRandomRWFile : public RandomRWFile { Status Write(uint64_t offset, const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToWrite(data); + Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToWrite = Slice(buf.BufferStart(), data.size()); + dataToWrite = Slice(buf.BufferStart(), 
buf.CurrentSize()); } status = file_->Write(offset, dataToWrite); return status; @@ -393,13 +398,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -430,13 +436,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -467,12 +474,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -513,12 +521,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -554,12 +563,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -609,11 +619,13 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } else { - // File is new, initialize & write prefix + // File is new, initialize & write prefix provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), 
prefixBuf.CurrentSize()); + // Write prefix status = underlying->Write(0, prefixSlice); if (!status.ok()) { return status; @@ -630,7 +642,7 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Store in *result the attributes of the children of the specified directory. + // Store in *result the attributes of the children of the specified directory. // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. @@ -670,8 +682,7 @@ class EncryptedEnv : public EnvWrapper { EncryptionProvider *provider_; }; - -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { return new EncryptedEnv(base_env, provider); @@ -694,14 +705,14 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not encrypting a full block. + // We're not encrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy plain data to block buffer + // Copy plain data to block buffer memmove(block + blockOffset, data, n); } auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); @@ -741,14 +752,14 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not decrypting a full block. + // We're not decrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy encrypted data to block buffer + // Copy encrypted data to block buffer memmove(block + blockOffset, data, n); } auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); @@ -807,7 +818,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce+counter auto status = cipher_.Encrypt(scratch); if (!status.ok()) { return status; @@ -823,13 +834,13 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { - // For CTR decryption & encryption are the same + // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of +// For optimal performance, the prefix length should be a multiple of // the page size. 
size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; @@ -844,7 +855,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory +// CreateNewPrefix initialized an allocated block of prefix memory // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, @@ -873,7 +884,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::OK(); } -// PopulateSecretPrefixPart initializes the data into a new prefix block +// PopulateSecretPrefixPart initializes the data into a new prefix block // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. @@ -908,7 +919,7 @@ Status CTREncryptionProvider::CreateCipherStream( return status; } - // Create cipher stream + // Create cipher stream return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); } From 24f73436fbdfb2728250ebeb076d4a953af58ddc Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 20 Jun 2019 11:41:59 -0700 Subject: [PATCH 169/572] sanitize and limit block_size under 4GB (#5492) Summary: `Block::restart_index_`, `Block::restarts_`, and `Block::current_` are defined as uint32_t but `BlockBasedTableOptions::block_size` is defined as a size_t so user might see corruption as in https://github.com/facebook/rocksdb/issues/5486. This PR adds a check in `BlockBasedTableFactory::SanitizeOptions` to disallow such configurations. yiwu-arbug Pull Request resolved: https://github.com/facebook/rocksdb/pull/5492 Differential Revision: D15914047 Pulled By: miasantreble fbshipit-source-id: c943f153d967e15aee7f2795730ab8259e2be201 --- db/db_test.cc | 11 +++++++++++ table/block_based/block_based_table_factory.cc | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index 0204f4d9f62..69e91923cd6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6156,6 +6156,17 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { fprintf(stderr, "Done. 
Flushed %d times, destroyed %d threads\n",
           flushes_done.load(), threads_destroyed.load());
 }
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(0, "foo", "bar"));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 8LL*1024*1024*1024LL;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
 } // namespace rocksdb

 int main(int argc, char** argv) {

diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 00b13033f3d..96812e233b8 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -257,6 +257,10 @@ Status BlockBasedTableFactory::SanitizeOptions(
     return Status::InvalidArgument(
         "Block alignment requested but block size is not a power of 2");
   }
+  if (table_options_.block_size > port::kMaxUint32) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
   if (table_options_.data_block_index_type ==
           BlockBasedTableOptions::kDataBlockBinaryAndHash &&
       table_options_.data_block_hash_table_util_ratio <= 0) {

From 0b0cb6f1a2f71eb4532416a959ebcf682ac9096b Mon Sep 17 00:00:00 2001
From: feilongliu
Date: Thu, 20 Jun 2019 13:04:13 -0700
Subject: [PATCH 170/572] Fix segfault in ~DBWithTTLImpl() when called after
 Close() (#5485)

Summary:
~DBWithTTLImpl() fails after Close() has been called (Close() invokes the Close() function of DBImpl), because Close() deletes default_cf_handle_, which is used by the GetOptions() call in ~DBWithTTLImpl(), hence leading to a segfault.

Fix by creating a Close() function for the DBWithTTLImpl class that does the close and the cleanup work originally done in ~DBWithTTLImpl(). If the Close() function is not called, it will be called in the ~DBWithTTLImpl() function.
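A usage sketch of the fixed lifecycle (the database path and TTL value below are illustrative): with this patch, an explicit `Close()` followed by `delete` is safe, because the destructor now checks whether `Close()` already ran.

```cpp
#include <cassert>

#include "rocksdb/utilities/db_ttl.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DBWithTTL* db = nullptr;
  rocksdb::Status s =
      rocksdb::DBWithTTL::Open(options, "/tmp/ttl_example", &db, /*ttl=*/1);
  assert(s.ok());
  s = db->Close();  // explicit close; cleans up the compaction filter once
  assert(s.ok());
  delete db;        // destructor sees closed_ == true and does not re-close
  return 0;
}
```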
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5485 Test Plan: make clean; USE_CLANG=1 make all check -j Differential Revision: D15924498 fbshipit-source-id: 567397fb972961059083a1ae0f9f99ff74872b78 --- utilities/ttl/db_ttl_impl.cc | 21 ++++++++++++++---- utilities/ttl/db_ttl_impl.h | 6 +++++ utilities/ttl/ttl_test.cc | 43 ++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 47049a13585..2c79d01ba12 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -34,12 +34,25 @@ void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, } // Open the db inside DBWithTTLImpl because options needs pointer to its ttl -DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db) {} +DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {} DBWithTTLImpl::~DBWithTTLImpl() { - // Need to stop background compaction before getting rid of the filter - CancelAllBackgroundWork(db_, /* wait = */ true); - delete GetOptions().compaction_filter; + if (!closed_) { + Close(); + } +} + +Status DBWithTTLImpl::Close() { + Status ret = Status::OK(); + if (!closed_) { + Options default_options = GetOptions(); + // Need to stop background compaction before getting rid of the filter + CancelAllBackgroundWork(db_, /* wait = */ true); + ret = db_->Close(); + delete default_options.compaction_filter; + closed_ = true; + } + return ret; } Status UtilityDB::OpenTtlDB(const Options& options, const std::string& dbname, diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 593cd64a0fc..1111c13a79f 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -35,6 +35,8 @@ class DBWithTTLImpl : public DBWithTTL { virtual ~DBWithTTLImpl(); + virtual Status Close() override; + Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle** handle, @@ -99,6 +101,10 @@ class DBWithTTLImpl : public DBWithTTL { void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); } void SetTtl(ColumnFamilyHandle *h, int32_t ttl) override; + + private: + // remember whether the Close completes or not + bool closed_; }; class TtlIterator : public Iterator { diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 38c6affab8f..61f5e64497d 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -86,9 +86,24 @@ class TtlTest : public testing::Test { ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true)); } + // Call db_ttl_->Close() before delete db_ttl_ void CloseTtl() { - delete db_ttl_; - db_ttl_ = nullptr; + CloseTtlHelper(true); + } + + // No db_ttl_->Close() before delete db_ttl_ + void CloseTtlNoDBClose() { + CloseTtlHelper(false); + } + + void CloseTtlHelper(bool close_db) { + if (db_ttl_ != nullptr) { + if (close_db) { + db_ttl_->Close(); + } + delete db_ttl_; + db_ttl_ = nullptr; + } } // Populates and returns a kv-map @@ -401,6 +416,30 @@ TEST_F(TtlTest, NoEffect) { CloseTtl(); } + +// Rerun the NoEffect test with a different version of CloseTtl +// function, where db is directly deleted without close. 
+TEST_F(TtlTest, DestructWithoutClose) {
+  MakeKVMap(kSampleSize_);
+  int64_t boundary1 = kSampleSize_ / 3;
+  int64_t boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);                       //T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);            //T=1: Set1 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);   //T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);            //T=2: Sets1 & 2 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);    //T=4: Sets 1,2,3 still there
+  CloseTtlNoDBClose();
+}
+
 // Puts a set of values and checks its presence using Get during ttl
 TEST_F(TtlTest, PresentDuringTTL) {
   MakeKVMap(kSampleSize_);

From 705b8eecb49272fb100b0ae4b735829e9adf5ca9 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 20 Jun 2019 14:28:22 -0700
Subject: [PATCH 171/572] Add more callers for table reader. (#5454)

Summary:
This PR adds more callers for table readers. This information is only used for block cache analysis, so that we can know which caller accesses a block.
1. It renames the BlockCacheLookupCaller to TableReaderCaller, as passing the caller from upstream requires changes to table_reader.h and TableReaderCaller is a more appropriate name.
2. It adds more table reader callers in table/table_reader_caller.h, e.g., kCompactionRefill, kExternalSSTIngestion, and kBuildTable.

This PR is long as it requires modification of interfaces in table_reader.h, e.g., NewIterator.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5454

Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32.

Differential Revision: D15819451

Pulled By: HaoyuHuang

fbshipit-source-id: b6caa704c8fb96ddd15b9a934b7e7ea87f88092d
---
 db/builder.cc | 5 +-
 db/compaction/compaction_job.cc | 12 ++-
 db/convenience.cc | 2 +-
 db/db_impl/db_impl.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 6 +-
 db/forward_iterator.cc | 29 ++++-
 db/repair.cc | 6 +-
 db/table_cache.cc | 6 +-
 db/table_cache.h | 10 +-
 db/version_set.cc | 100 ++++++++++--------
 db/version_set.h | 7 +-
 table/block_based/block_based_table_reader.cc | 48 ++++-----
 table/block_based/block_based_table_reader.h | 44 +++-----
 table/block_based/partitioned_filter_block.cc | 2 +-
 table/cuckoo/cuckoo_table_reader.cc | 2 +-
 table/cuckoo/cuckoo_table_reader.h | 8 +-
 table/cuckoo/cuckoo_table_reader_test.cc | 9 +-
 table/mock_table.cc | 3 +-
 table/mock_table.h | 9 +-
 table/plain/plain_table_reader.cc | 4 +-
 table/plain/plain_table_reader.h | 8 +-
 table/sst_file_reader.cc | 7 +-
 table/table_reader.h | 16 +--
 table/table_reader_bench.cc | 4 +-
 table/table_reader_caller.h | 39 +++++++
 table/table_test.cc | 73 ++++++++-----
 tools/block_cache_trace_analyzer.cc | 38 +++++--
 tools/block_cache_trace_analyzer.h | 2 +-
 tools/block_cache_trace_analyzer_test.cc | 20 ++--
 tools/sst_dump_tool.cc | 9 +-
 trace_replay/block_cache_tracer.cc | 10 +-
 trace_replay/block_cache_tracer.h | 24 ++---
 trace_replay/block_cache_tracer_test.cc | 16 +--
 33 files changed, 337 insertions(+), 243 deletions(-)
 create mode 100644 table/table_reader_caller.h

diff --git a/db/builder.cc b/db/builder.cc
index 67d764ad18b..eac1b5fe2e1 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -221,8 +221,9 @@ Status BuildTable(
         mutable_cf_options.prefix_extractor.get(), nullptr,
         (internal_stats == nullptr) ?
nullptr : internal_stats->GetFileReadHist(0), - false /* for_compaction */, nullptr /* arena */, - false /* skip_filter */, level)); + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 65efedad5b4..db701d19dad 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -521,7 +521,7 @@ void CompactionJob::GenSubcompactionBoundaries() { // mutex to reduce contention db_mutex_->Unlock(); uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, - /*for_compaction*/ true); + TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; @@ -646,12 +646,14 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), env_options_, cfd->internal_comparator(), - *files_meta[file_idx], nullptr /* range_del_agg */, - prefix_extractor, nullptr, + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), - false, nullptr /* arena */, false /* skip_filters */, - compact_->compaction->output_level()); + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { diff --git a/db/convenience.cc b/db/convenience.cc index c11653fb190..271217cd4f8 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -59,7 +59,7 @@ Status VerifySstFileChecksum(const Options& options, if (!s.ok()) { return s; } - s = table_reader->VerifyChecksum(); + s = table_reader->VerifyChecksum(TableReaderCaller::kUserVerifyChecksum); return s; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 6341b76854c..f3fc96d8d1f 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2771,7 +2771,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { sizes[i] += versions_->ApproximateSize( v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, - /*for_compaction=*/false); + TableReaderCaller::kUserApproximateSize); } if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 0068685b0ba..7e9657cc901 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -308,7 +308,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } if (ingestion_options_.verify_checksums_before_ingest) { - status = table_reader->VerifyChecksum(); + status = + table_reader->VerifyChecksum(TableReaderCaller::kExternalSSTIngestion); } if (!status.ok()) { return status; @@ -368,7 +369,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // updating the block cache. 
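    // (The iterator created below is now tagged with
    // TableReaderCaller::kExternalSSTIngestion, so these reads are attributed
    // to external SST ingestion in block cache traces.)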
ro.fill_cache = false; std::unique_ptr iter(table_reader->NewIterator( - ro, sv->mutable_cf_options.prefix_extractor.get())); + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); std::unique_ptr range_del_iter( table_reader->NewRangeTombstoneIterator(ro)); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 2633a3ff9bd..c875008c769 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -79,7 +79,11 @@ class ForwardLevelIterator : public InternalIterator { read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false); + prefix_extractor_, /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -642,7 +646,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get())); + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } BuildLevelIterators(vstorage); current_ = nullptr; @@ -714,7 +723,12 @@ void ForwardIterator::RenewIterators() { read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get())); + svnew->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } for (auto* f : l0_iters_) { @@ -772,8 +786,13 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - *l0_files[i], nullptr /* range_del_agg */, - sv_->mutable_cf_options.prefix_extractor.get()); + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/repair.cc b/db/repair.cc index 3ae46c6e7ee..8967b39f30b 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -520,7 +520,11 @@ class Repairer { InternalIterator* iter = table_cache_->NewIterator( ropts, env_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, + /*level=*/-1, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/table_cache.cc b/db/table_cache.cc index bbfaf32e09e..b98d4b074ff 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -176,7 +176,7 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -187,7 +187,7 @@ InternalIterator* TableCache::NewIterator( if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - + bool for_compaction = caller == TableReaderCaller::kCompaction; auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { @@ -206,7 +206,7 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction, + skip_filters, caller, env_options.compaction_readahead_size); } if (handle != nullptr) { diff --git a/db/table_cache.h b/db/table_cache.h index dbf76039a23..f274337e952 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -70,12 +70,10 @@ class TableCache { const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor = nullptr, - TableReader** table_reader_ptr = nullptr, - HistogramImpl* file_read_hist = nullptr, bool 
for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1, - const InternalKey* smallest_compaction_key = nullptr, - const InternalKey* largest_compaction_key = nullptr); + const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, + HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, + bool skip_filters, int level, const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 9978c8cd463..8e2d21b051a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -850,14 +850,15 @@ namespace { class LevelIterator final : public InternalIterator { public: - LevelIterator( - TableCache* table_cache, const ReadOptions& read_options, - const EnvOptions& env_options, const InternalKeyComparator& icomparator, - const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, - bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, - const std::vector* compaction_boundaries = - nullptr) + LevelIterator(TableCache* table_cache, const ReadOptions& read_options, + const EnvOptions& env_options, + const InternalKeyComparator& icomparator, + const LevelFilesBrief* flevel, + const SliceTransform* prefix_extractor, bool should_sample, + HistogramImpl* file_read_hist, TableReaderCaller caller, + bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::vector* + compaction_boundaries = nullptr) : InternalIterator(false), table_cache_(table_cache), read_options_(read_options), @@ -868,7 +869,7 @@ class LevelIterator final : public InternalIterator { prefix_extractor_(prefix_extractor), file_read_hist_(file_read_hist), should_sample_(should_sample), - for_compaction_(for_compaction), + caller_(caller), skip_filters_(skip_filters), file_index_(flevel_->num_files), level_(level), @@ -957,9 +958,9 @@ class LevelIterator final : public InternalIterator { return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, caller_, + /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, + largest_compaction_key); } TableCache* table_cache_; @@ -973,7 +974,7 @@ class LevelIterator final : public InternalIterator { HistogramImpl* file_read_hist_; bool should_sample_; - bool for_compaction_; + TableReaderCaller caller_; bool skip_filters_; size_t file_index_; int level_; @@ -1442,10 +1443,14 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, - range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena, - false /* skip_filters */, 0 /* level */)); + read_options, soptions, cfd_->internal_comparator(), + *file.file_metadata, range_del_agg, + 
mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } if (should_sample) { // Count ones for every L0 files. This is done per iterator creation @@ -1466,8 +1471,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, - range_del_agg)); + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + range_del_agg, /*largest_compaction_key=*/nullptr)); } } @@ -1496,10 +1501,14 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, continue; } ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( - read_options, env_options, cfd_->internal_comparator(), *file->file_metadata, - &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, &arena, - false /* skip_filters */, 0 /* level */)); + read_options, env_options, cfd_->internal_comparator(), + *file->file_metadata, &range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, &arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1513,7 +1522,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); @@ -4823,7 +4832,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // maintain state of where they first appear in the files. 
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, - int end_level, bool for_compaction) { + int end_level, TableReaderCaller caller) { // pre-condition assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); @@ -4844,7 +4853,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (!level) { // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end, for_compaction); + size += ApproximateSizeLevel0(v, files_brief, start, end, caller); continue; } @@ -4861,7 +4870,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, // inferred from the sorted order for (uint64_t i = idx_start; i < files_brief.num_files; i++) { uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end, for_compaction); + val = ApproximateSize(v, files_brief.files[i], end, caller); if (!val) { // the files after this will not have the range break; @@ -4872,7 +4881,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (i == idx_start) { // subtract the bytes needed to be scanned to get to the starting // key - val = ApproximateSize(v, files_brief.files[i], start, for_compaction); + val = ApproximateSize(v, files_brief.files[i], start, caller); assert(size >= val); size -= val; } @@ -4886,15 +4895,15 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& key_start, const Slice& key_end, - bool for_compaction) { + TableReaderCaller caller) { // level 0 files are not in sorted order, we need to iterate through // the list to compute the total bytes that require scanning uint64_t size = 0; for (size_t i = 0; i < files_brief.num_files; i++) { const uint64_t start = - ApproximateSize(v, files_brief.files[i], key_start, for_compaction); + ApproximateSize(v, files_brief.files[i], key_start, caller); const uint64_t end = - ApproximateSize(v, files_brief.files[i], key_end, for_compaction); + ApproximateSize(v, files_brief.files[i], key_end, caller); assert(end >= start); size += end - start; } @@ -4902,7 +4911,8 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, bool for_compaction) { + const Slice& key, + TableReaderCaller caller) { // pre-condition assert(v); @@ -4920,9 +4930,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, InternalIterator* iter = v->cfd_->table_cache()->NewIterator( ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); + v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr, + /*file_read_hist=*/nullptr, caller, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key, for_compaction); + result = table_reader_ptr->ApproximateOffsetOf(key, caller); } delete iter; } @@ -5001,10 +5015,12 @@ InternalIterator* VersionSet::MakeInputIterator( read_options, env_options_compactions, cfd->internal_comparator(), *flevel->files[i].file_metadata, range_del_agg, c->mutable_cf_options()->prefix_extractor.get(), - nullptr /* table_reader_ptr */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, nullptr /* arena */, - false 
/* skip_filters */, static_cast(which) /* level */); + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/static_cast(which), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); } } else { // Create concatenating iterator for the files from this level @@ -5012,10 +5028,10 @@ InternalIterator* VersionSet::MakeInputIterator( cfd->table_cache(), read_options, env_options_compactions, cfd->internal_comparator(), c->input_levels(which), c->mutable_cf_options()->prefix_extractor.get(), - false /* should_sample */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* skip_filters */, - static_cast(which) /* level */, range_del_agg, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(which), range_del_agg, c->boundaries(which)); } } diff --git a/db/version_set.h b/db/version_set.h index ba1b4d3e3d0..6b7c42881c1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -984,7 +984,8 @@ class VersionSet { // in levels [start_level, end_level). If end_level == 0 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level, int end_level, bool for_compaction); + int start_level, int end_level, + TableReaderCaller caller); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1035,10 +1036,10 @@ class VersionSet { // ApproximateSize helper uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& start, const Slice& end, - bool for_compaction); + TableReaderCaller caller); uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, bool for_compaction); + const Slice& key, TableReaderCaller caller); // Save current contents to *log Status WriteSnapshot(log::Writer* log); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 9339c35364f..5b2f515006f 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -349,7 +349,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { nullptr, kNullStats, true, index_key_includes_seq(), index_value_is_full()), false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - index_key_includes_seq(), index_value_is_full()); + index_key_includes_seq(), index_value_is_full(), + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); } assert(it != nullptr); @@ -365,7 +367,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; @@ -1075,7 +1077,7 @@ Status BlockBasedTable::Open( // Better not mutate rep_ after the creation. eg. internal_prefix_transform // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. 
- BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, internal_comparator, skip_filters, level, immortal_table); @@ -2681,7 +2683,7 @@ void BlockBasedTableIterator::InitDataBlock() { // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. // Explicit user requested readahead: // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (!for_compaction_) { + if (lookup_context_.caller != TableReaderCaller::kCompaction) { if (read_options_.readahead_size == 0) { // Implicit auto readahead num_file_reads_++; @@ -2728,7 +2730,8 @@ void BlockBasedTableIterator::InitDataBlock() { read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), - for_compaction_); + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); block_iter_points_to_real_block_ = true; } } @@ -2814,11 +2817,8 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction, - size_t compaction_readahead_size) { - BlockCacheLookupContext lookup_context{ - for_compaction ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserIterator}; + Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); if (arena == nullptr) { @@ -2832,7 +2832,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); } else { auto* mem = @@ -2845,8 +2845,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, - compaction_readahead_size); + /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); } } @@ -2933,7 +2932,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet}; { if (!skip_filters) { filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, @@ -2989,7 +2988,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, break; } else { BlockCacheLookupContext lookup_data_block_context{ - BlockCacheLookupCaller::kUserGet}; + TableReaderCaller::kUserGet}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3084,7 +3083,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - 
BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; @@ -3135,7 +3134,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - BlockCacheLookupCaller::kUserMGet); + TableReaderCaller::kUserMultiGet); if (iiter->value().offset() != offset) { offset = iiter->value().offset(); biter.Invalidate(Status::OK()); @@ -3244,7 +3243,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); } - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, @@ -3299,9 +3298,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } -Status BlockBasedTable::VerifyChecksum() { - // TODO(haoyu): This function is called by external sst ingestion and the - // verify checksum public API. We don't log its block cache accesses for now. +Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { Status s; // Check Meta blocks std::unique_ptr meta; @@ -3317,9 +3314,10 @@ Status BlockBasedTable::VerifyChecksum() { } // Check Data blocks IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; InternalIteratorBase* iiter = NewIndexIterator( ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, - /*get_context=*/nullptr, /*lookup_contex=*/nullptr); + /*get_context=*/nullptr, &context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr = @@ -3536,10 +3534,8 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, - bool for_compaction) { - BlockCacheLookupContext context( - for_compaction ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserApproximateSize); + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index be758c96798..b03e67128e2 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -123,18 +123,13 @@ class BlockBasedTable : public TableReader { // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). // @param skip_filters Disables loading/accessing the filter block - // compaction_readahead_size: its value will only be used if for_compaction = - // true - InternalIterator* NewIterator( - const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, bool skip_filters = false, - // TODO(haoyu) 1. External SST ingestion sets for_compaction as false. 2. - // Compaction also sets it to false when paranoid_file_checks is true, - // i.e., it will populate the block cache with blocks in the new SST - // files. 
We treat those as a user is calling iterator for now. We should - // differentiate the callers. - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + // compaction_readahead_size: its value will only be used if caller = + // kCompaction. + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -160,7 +155,8 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key, bool for_compaction) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; bool TEST_BlockInCache(const BlockHandle& handle) const; @@ -180,7 +176,7 @@ class BlockBasedTable : public TableReader { Status DumpTable(WritableFile* out_file, const SliceTransform* prefix_extractor = nullptr) override; - Status VerifyChecksum() override; + Status VerifyChecksum(TableReaderCaller caller) override; void Close() override; @@ -609,9 +605,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, - BlockType block_type, bool key_includes_seq = true, - bool index_key_is_full = true, - bool for_compaction = false, + BlockType block_type, bool key_includes_seq, + bool index_key_is_full, TableReaderCaller caller, size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), @@ -627,11 +622,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction), - compaction_readahead_size_(compaction_readahead_size), - lookup_context_(for_compaction - ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserIterator) {} + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -740,13 +732,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; bool index_key_is_full_; - // If this iterator is created for compaction - bool for_compaction_; - // Readahead size used in compaction, its value is used only if - // for_compaction_ = true - size_t compaction_readahead_size_; BlockHandle prev_index_value_; BlockCacheLookupContext lookup_context_; + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. 
+ size_t compaction_readahead_size_; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index e80085dfb5b..cce6744157e 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -277,7 +277,7 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 821743608e4..30109ece6ce 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -377,7 +377,7 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/, + bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index cdb0302bd3d..10db084259f 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -50,10 +50,8 @@ class CuckooTableReader: public TableReader { // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. 
@@ -61,7 +59,7 @@ class CuckooTableReader: public TableReader { // Following methods are not implemented for Cuckoo Table Reader uint64_t ApproximateOffsetOf(const Slice& /*key*/, - bool /*for_compaction*/ = false) override { + TableReaderCaller /*caller*/) override { return 0; } void SetupForCompaction() override {} diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index dd65ffe8490..dd1557db147 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -146,8 +146,9 @@ class CuckooReaderTest : public testing::Test { CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - InternalIterator* it = - reader.NewIterator(ReadOptions(), nullptr, nullptr, false); + InternalIterator* it = reader.NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -186,7 +187,9 @@ class CuckooReaderTest : public testing::Test { delete it; Arena arena; - it = reader.NewIterator(ReadOptions(), nullptr, &arena); + it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena, + /*skip_filters=*/false, + TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->Seek(keys[num_items/2]); diff --git a/table/mock_table.cc b/table/mock_table.cc index 4d55bf7c9a8..022f9a63f52 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -34,8 +34,7 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/, - size_t /*compaction_readahead_size*/) { + Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } diff --git a/table/mock_table.h b/table/mock_table.h index 6a5b5ab31cd..4b886e63e25 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -42,17 +42,16 @@ class MockTableReader : public TableReader { InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; uint64_t ApproximateOffsetOf(const Slice& /*key*/, - bool /*for_compaction*/ = false) override { + TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 2f036e61ae1..f6c348fdbf9 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -196,7 +196,7 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/, + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { @@ -616,7 +616,7 
+616,7 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
 }

 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
-                                               bool /*for_compaction*/) {
+                                               TableReaderCaller /*caller*/) {
   return 0;
 }

diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h
index 7a468bdb8c8..f63649cacf8 100644
--- a/table/plain/plain_table_reader.h
+++ b/table/plain/plain_table_reader.h
@@ -82,10 +82,8 @@ class PlainTableReader: public TableReader {
   // true
   InternalIterator* NewIterator(const ReadOptions&,
                                 const SliceTransform* prefix_extractor,
-                                Arena* arena = nullptr,
-                                bool skip_filters = false,
-                                bool for_compaction = false,
-                                size_t compaction_readahead_size = 0) override;
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller, size_t compaction_readahead_size = 0) override;

   void Prepare(const Slice& target) override;

@@ -94,7 +92,7 @@ class PlainTableReader: public TableReader {
                 bool skip_filters = false) override;

   uint64_t ApproximateOffsetOf(const Slice& key,
-                               bool for_compaction = false) override;
+                               TableReaderCaller caller) override;

   uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
   void SetupForCompaction() override;
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index 54408bb50e9..7c3b91cc39a 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -65,8 +65,9 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& options) {
   auto sequence = options.snapshot != nullptr
                       ? options.snapshot->GetSequenceNumber()
                       : kMaxSequenceNumber;
-  auto internal_iter =
-      r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get());
+  auto internal_iter = r->table_reader->NewIterator(
+      options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTFileReader);
   return NewDBIterator(r->options.env, options, r->ioptions, r->moptions,
                        r->ioptions.user_comparator, internal_iter, sequence,
                        r->moptions.max_sequential_skip_in_iterations,
@@ -79,7 +80,7 @@ std::shared_ptr SstFileReader::GetTableProperties()
 }

 Status SstFileReader::VerifyChecksum() {
-  return rep_->table_reader->VerifyChecksum();
+  return rep_->table_reader->VerifyChecksum(TableReaderCaller::kSSTFileReader);
 }

 }  // namespace rocksdb
diff --git a/table/table_reader.h b/table/table_reader.h
index 2904526e59b..1c879cb1f81 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -14,6 +14,7 @@
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/multiget_context.h"
+#include "table/table_reader_caller.h"

 namespace rocksdb {

@@ -44,12 +45,11 @@ class TableReader {
   // all the states but those allocated in arena.
   // skip_filters: disables checking the bloom filters even if they exist. This
   // option is effective only for block-based table format.
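+  // caller: the call site requesting this iterator; only used for block
+  // cache tracing and analysis.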
-  // compaction_readahead_size: its value will only be used if for_compaction =
-  // true
-  virtual InternalIterator* NewIterator(
-      const ReadOptions&, const SliceTransform* prefix_extractor,
-      Arena* arena = nullptr, bool skip_filters = false,
-      bool for_compaction = false, size_t compaction_readahead_size = 0) = 0;
+  // compaction_readahead_size: its value will only be used if caller = kCompaction
+  virtual InternalIterator* NewIterator(const ReadOptions&,
+                                        const SliceTransform* prefix_extractor,
+                                        Arena* arena, bool skip_filters,
+                                        TableReaderCaller caller, size_t compaction_readahead_size = 0) = 0;

   virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
       const ReadOptions& /*read_options*/) {
@@ -63,7 +63,7 @@ class TableReader {
   // E.g., the approximate offset of the last key in the table will
   // be close to the file length.
   virtual uint64_t ApproximateOffsetOf(const Slice& key,
-                                       bool for_compaction = false) = 0;
+                                       TableReaderCaller caller) = 0;

   // Set up the table for Compaction. Might change some parameters with
   // posix_fadvise
@@ -122,7 +122,7 @@
   }

   // check whether there is corruption in this db file
-  virtual Status VerifyChecksum() {
+  virtual Status VerifyChecksum(TableReaderCaller /*caller*/) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }

diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index 2ec7b2d0fb5..cec62df5949 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -198,7 +198,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     Iterator* iter = nullptr;
     InternalIterator* iiter = nullptr;
     if (!through_db) {
-      iiter = table_reader->NewIterator(read_options, nullptr);
+      iiter = table_reader->NewIterator(
+          read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized);
     } else {
       iter = db->NewIterator(read_options);
     }
diff --git a/table/table_reader_caller.h b/table/table_reader_caller.h
new file mode 100644
index 00000000000..90c64687197
--- /dev/null
+++ b/table/table_reader_caller.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace rocksdb {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or the table reader is called in a test environment, e.g., unit test, table
+// reader benchmark, etc.
+enum TableReaderCaller : char {
+  kUserGet = 1,
+  kUserMultiGet = 2,
+  kUserIterator = 3,
+  kUserApproximateSize = 4,
+  kUserVerifyChecksum = 5,
+  kSSTDumpTool = 6,
+  kExternalSSTIngestion = 7,
+  kRepair = 8,
+  kPrefetch = 9,
+  kCompaction = 10,
+  // A compaction job may refill the block cache with blocks in the new SST
+  // files if paranoid_file_checks is true.
+  kCompactionRefill = 11,
+  // After building a table, it may load all its blocks into the block cache if
+  // paranoid_file_checks is true.
+  kFlush = 12,
+  // sst_file_reader.
+  kSSTFileReader = 13,
+  // A list of callers that are either not interesting for analysis or are
+  // calling from a test environment, e.g., unit test, benchmark, etc.
+ kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index 8e290368428..2e2286efae4 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -370,7 +370,9 @@ class TableConstructor: public Constructor { InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { ReadOptions ro; - InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); + InternalIterator* iter = table_reader_->NewIterator( + ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -382,9 +384,11 @@ class TableConstructor: public Constructor { if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); - return table_reader_->ApproximateOffsetOf(skey); + return table_reader_->ApproximateOffsetOf( + skey, TableReaderCaller::kUncategorized); } - return table_reader_->ApproximateOffsetOf(key); + return table_reader_->ApproximateOffsetOf( + key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableCFOptions& ioptions, @@ -1538,8 +1542,9 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1597,8 +1602,9 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { for (int i = 0; i < 2; ++i) { ReadOptions ro; ro.total_order_seek = (i == 0); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(key.Encode()); ASSERT_OK(iter->status()); @@ -1635,8 +1641,9 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { const MutableCFOptions new_moptions(options); c.Reopen(new_ioptions, new_moptions); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup // only one kv @@ -1702,8 +1709,9 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr index_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr index_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. 
std::vector prefixes = {"001", "003", "005", "007", "009"}; @@ -1819,8 +1827,9 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { auto reader = c.GetTableReader(); ReadOptions ropt; ropt.read_tier = ReadTier::kBlockCacheTier; - std::unique_ptr iter( - reader->NewIterator(ropt, /* prefix_extractor */ nullptr)); + std::unique_ptr iter(reader->NewIterator( + ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); auto ikey = [](Slice user_key) { return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); @@ -3136,8 +3145,9 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { &kvmap); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup for (auto& kv : kvmap) { @@ -3329,8 +3339,9 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { EnvOptions(), ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); - return table_reader->NewIterator(ReadOptions(), - moptions.prefix_extractor.get()); + return table_reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; GetVersionAndGlobalSeqno(); @@ -3501,7 +3512,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { std::move(file_reader), ss_rw.contents().size(), &table_reader)); std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get())); + ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { @@ -3795,8 +3807,9 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { auto reader = c.GetTableReader(); std::unique_ptr seek_iter; - seek_iter.reset( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + seek_iter.reset(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; // for every kv, we seek using two method: Get() and Seek() @@ -3877,13 +3890,15 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Slice upper_bound_slice(upper_bound); read_opt.iterate_upper_bound = &upper_bound_slice; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); @@ -3913,8 +3928,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { Slice ub_slice1(ub1); read_opt.iterate_upper_bound = &ub_slice1; std::unique_ptr iter; - 
iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar", iter->key()); @@ -3924,8 +3940,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("foo", iter->key()); diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index a8259de71b5..732094bf29b 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -67,18 +67,36 @@ std::string block_type_to_string(TraceType type) { return "InvalidType"; } -std::string caller_to_string(BlockCacheLookupCaller caller) { +std::string caller_to_string(TableReaderCaller caller) { switch (caller) { case kUserGet: return "Get"; - case kUserMGet: + case kUserMultiGet: return "MultiGet"; case kUserIterator: return "Iterator"; + case kUserApproximateSize: + return "ApproximateSize"; + case kUserVerifyChecksum: + return "VerifyChecksum"; + case kSSTDumpTool: + return "SSTDumpTool"; + case kExternalSSTIngestion: + return "ExternalSSTIngestion"; + case kRepair: + return "Repair"; case kPrefetch: return "Prefetch"; case kCompaction: return "Compaction"; + case kCompactionRefill: + return "CompactionRefill"; + case kFlush: + return "Flush"; + case kSSTFileReader: + return "SSTFileReader"; + case kUncategorized: + return "Uncategorized"; default: break; } @@ -450,10 +468,10 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { uint64_t total_num_blocks = 0; uint64_t total_num_accesses = 0; std::map bt_num_blocks_map; - std::map caller_num_access_map; - std::map> + std::map caller_num_access_map; + std::map> caller_bt_num_access_map; - std::map> + std::map> caller_level_num_access_map; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. @@ -462,12 +480,12 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { uint64_t cf_num_blocks = 0; std::map cf_bt_blocks; uint64_t cf_num_accesses = 0; - std::map cf_caller_num_accesses_map; - std::map> + std::map cf_caller_num_accesses_map; + std::map> cf_caller_level_num_accesses_map; - std::map> + std::map> cf_caller_file_num_accesses_map; - std::map> + std::map> cf_caller_bt_num_accesses_map; total_num_files += cf_aggregates.second.fd_aggregates_map.size(); for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { @@ -492,7 +510,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { for (auto const& stats : block_access_info.second.caller_num_access_map) { // Stats per caller. - const BlockCacheLookupCaller caller = stats.first; + const TableReaderCaller caller = stats.first; const uint64_t num_accesses = stats.second; // Overall stats. 
        total_num_accesses += num_accesses;
diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h
index 0690d14d0f3..c953ecf2164 100644
--- a/tools/block_cache_trace_analyzer.h
+++ b/tools/block_cache_trace_analyzer.h
@@ -72,7 +72,7 @@ struct BlockAccessInfo {
   std::map<std::string, uint64_t> non_exist_key_num_access_map;  // for keys that do not exist in this block.
   uint64_t num_referenced_key_exist_in_block = 0;
-  std::map<BlockCacheLookupCaller, uint64_t> caller_num_access_map;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;

   void AddAccess(const BlockCacheTraceRecord& access) {
     if (first_access_time == 0) {
diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
index df99e1f616e..c361ba054ac 100644
--- a/tools/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -61,23 +61,23 @@ class BlockCacheTracerTest : public testing::Test {
     EXPECT_OK(env_->DeleteDir(test_path_));
   }

-  BlockCacheLookupCaller GetCaller(uint32_t key_id) {
+  TableReaderCaller GetCaller(uint32_t key_id) {
     uint32_t n = key_id % 5;
     switch (n) {
       case 0:
-        return BlockCacheLookupCaller::kPrefetch;
+        return TableReaderCaller::kPrefetch;
       case 1:
-        return BlockCacheLookupCaller::kCompaction;
+        return TableReaderCaller::kCompaction;
      case 2:
-        return BlockCacheLookupCaller::kUserGet;
+        return TableReaderCaller::kUserGet;
      case 3:
-        return BlockCacheLookupCaller::kUserMGet;
+        return TableReaderCaller::kUserMultiGet;
      case 4:
-        return BlockCacheLookupCaller::kUserIterator;
+        return TableReaderCaller::kUserIterator;
     }
     // This cannot happen.
     assert(false);
-    return BlockCacheLookupCaller::kUserGet;
+    return TableReaderCaller::kMaxBlockCacheLookupCaller;
   }

   void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
@@ -124,15 +124,15 @@ class BlockCacheTracerTest : public testing::Test {
       ASSERT_GT(block_access_info.first_access_time, 0);
       ASSERT_GT(block_access_info.last_access_time, 0);
       ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
-      BlockCacheLookupCaller expected_caller = GetCaller(key_id);
+      TableReaderCaller expected_caller = GetCaller(key_id);
       ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
                   block_access_info.caller_num_access_map.end());
       ASSERT_EQ(
           1,
           block_access_info.caller_num_access_map.find(expected_caller)->second);
-      if ((expected_caller == BlockCacheLookupCaller::kUserGet ||
-           expected_caller == BlockCacheLookupCaller::kUserMGet) &&
+      if ((expected_caller == TableReaderCaller::kUserGet ||
+           expected_caller == TableReaderCaller::kUserMultiGet) &&
          type == TraceType::kBlockTraceDataBlock) {
        ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
        ASSERT_EQ(1, block_access_info.key_num_access_map.size());
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index ed5600194ad..260d15f303c 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -143,7 +143,7 @@ Status SstFileDumper::NewTableReader(
 }

 Status SstFileDumper::VerifyChecksum() {
-  return table_reader_->VerifyChecksum();
+  return table_reader_->VerifyChecksum(TableReaderCaller::kSSTDumpTool);
 }

 Status SstFileDumper::DumpTable(const std::string& out_filename) {
@@ -173,7 +173,8 @@ uint64_t SstFileDumper::CalculateCompressedTableSize(
       TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
       dest_writer.get()));
   std::unique_ptr iter(table_reader_->NewIterator(
-      ReadOptions(), moptions_.prefix_extractor.get()));
+      ReadOptions(), moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
   for (iter->SeekToFirst();
iter->Valid(); iter->Next()) { if (!iter->status().ok()) { fputs(iter->status().ToString().c_str(), stderr); @@ -299,7 +300,9 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, } InternalIterator* iter = table_reader_->NewIterator( - ReadOptions(verify_checksum_, false), moptions_.prefix_extractor.get()); + ReadOptions(verify_checksum_, false), moptions_.prefix_extractor.get(), + /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kSSTDumpTool); uint64_t i = 0; if (has_from) { InternalKey ikey; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index a0f0676eecf..4c5ad011609 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -31,11 +31,11 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; -bool BlockCacheTraceHelper::ShouldTraceReferencedKey( - TraceType block_type, BlockCacheLookupCaller caller) { +bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, + TableReaderCaller caller) { return (block_type == TraceType::kBlockTraceDataBlock) && - (caller == BlockCacheLookupCaller::kUserGet || - caller == BlockCacheLookupCaller::kUserMGet); + (caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet); } BlockCacheTraceWriter::BlockCacheTraceWriter( @@ -182,7 +182,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read caller."); } - record->caller = static_cast<BlockCacheLookupCaller>(enc_slice[0]); + record->caller = static_cast<TableReaderCaller>(enc_slice[0]); enc_slice.remove_prefix(kCharSize); if (enc_slice.empty()) { return Status::Incomplete( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index bf88133111e..e7f38db3c6d 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -11,21 +11,11 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/trace_reader_writer.h" +#include "table/table_reader_caller.h" #include "trace_replay/trace_replay.h" namespace rocksdb { -enum BlockCacheLookupCaller : char { - kUserGet = 1, - kUserMGet = 2, - kUserIterator = 3, - kUserApproximateSize = 4, - kPrefetch = 5, - kCompaction = 6, - // All callers should be added before kMaxBlockCacheLookupCaller. - kMaxBlockCacheLookupCaller -}; - // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter // 2. BlockBasedTable::GetUncompressionDict // 3. BlockBasedTable::MaybeReadBlockAndLoadToCache // 4. BlockBasedTable::Get // 5. BlockBasedTable::MultiGet // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or // kUserApproximateSize). struct BlockCacheLookupContext { - BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) - : caller(_caller) {} - const BlockCacheLookupCaller caller; + BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} + const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses this information when logging the block access at // BlockBasedTable::GET and BlockBasedTable::MultiGet.
@@ -84,8 +73,7 @@ struct BlockCacheTraceRecord { std::string cf_name; uint32_t level = 0; uint64_t sst_fd_number = 0; - BlockCacheLookupCaller caller = - BlockCacheLookupCaller::kMaxBlockCacheLookupCaller; + TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; Boolean is_cache_hit = Boolean::kFalse; Boolean no_insert = Boolean::kFalse; @@ -100,7 +88,7 @@ struct BlockCacheTraceRecord { BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, TraceType _block_type, uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, uint32_t _level, - uint64_t _sst_fd_number, BlockCacheLookupCaller _caller, + uint64_t _sst_fd_number, TableReaderCaller _caller, bool _is_cache_hit, bool _no_insert, std::string _referenced_key = "", uint64_t _referenced_data_size = 0, @@ -134,7 +122,7 @@ struct BlockCacheTraceHeader { class BlockCacheTraceHelper { public: static bool ShouldTraceReferencedKey(TraceType block_type, - BlockCacheLookupCaller caller); + TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; }; diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 95fe16b8c8f..44cba7bfbd8 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -37,19 +37,19 @@ class BlockCacheTracerTest : public testing::Test { EXPECT_OK(env_->DeleteDir(test_path_)); } - BlockCacheLookupCaller GetCaller(uint32_t key_id) { + TableReaderCaller GetCaller(uint32_t key_id) { uint32_t n = key_id % 5; switch (n) { case 0: - return BlockCacheLookupCaller::kPrefetch; + return TableReaderCaller::kPrefetch; case 1: - return BlockCacheLookupCaller::kCompaction; + return TableReaderCaller::kCompaction; case 2: - return BlockCacheLookupCaller::kUserGet; + return TableReaderCaller::kUserGet; case 3: - return BlockCacheLookupCaller::kUserMGet; + return TableReaderCaller::kUserMultiGet; case 4: - return BlockCacheLookupCaller::kUserIterator; + return TableReaderCaller::kUserIterator; } assert(false); } @@ -121,8 +121,8 @@ class BlockCacheTracerTest : public testing::Test { ASSERT_EQ(Boolean::kFalse, record.is_cache_hit); ASSERT_EQ(Boolean::kFalse, record.no_insert); if (block_type == TraceType::kBlockTraceDataBlock && - (record.caller == BlockCacheLookupCaller::kUserGet || - record.caller == BlockCacheLookupCaller::kUserMGet)) { + (record.caller == TableReaderCaller::kUserGet || + record.caller == TableReaderCaller::kUserMultiGet)) { ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), record.referenced_key); ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); From 1bfeffab2dbff7eaf74a61cf52d57cf1404ef159 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 20 Jun 2019 22:14:00 -0700 Subject: [PATCH 172/572] Stop printing after verification fails (#5493) Summary: Stop verification and printing once verification fails. 
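For illustration, the early-exit pattern here boils down to checking a shared failure flag before doing more verification work. Below is a minimal self-contained C++ sketch; SharedState is a hypothetical stand-in for db_stress's shared state object, and only HasVerificationFailedYet() mirrors the real accessor used in the diff that follows.

#include <atomic>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for db_stress's shared state: a flag that any
// verification thread can set once it detects a mismatch.
struct SharedState {
  std::atomic<bool> verification_failed{false};
  bool HasVerificationFailedYet() const { return verification_failed.load(); }
};

// Verify values, but stop verifying (and printing) as soon as a failure has
// been recorded, mirroring the early-exit added to the loop in the patch.
void VerifyAll(SharedState* shared, const std::vector<int>& got,
               const std::vector<int>& expected) {
  for (size_t i = 0; i < got.size() && i < expected.size(); ++i) {
    if (shared->HasVerificationFailedYet()) {
      break;  // a mismatch was already reported; avoid flooding the output
    }
    if (got[i] != expected[i]) {
      shared->verification_failed.store(true);
      std::fprintf(stderr, "Verification failed at index %zu\n", i);
    }
  }
}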
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5493 Differential Revision: D15928992 Pulled By: riversand963 fbshipit-source-id: 699feac034a217d57280aa3fb50f5aba06adf317 --- tools/db_stress.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 5fd84258b1f..6a3e8bdefb1 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -4054,6 +4054,9 @@ class AtomicFlushStressTest : public StressTest { assert(num == iters.size()); std::vector<Status> statuses(num, Status::OK()); do { + if (shared->HasVerificationFailedYet()) { + break; + } size_t valid_cnt = 0; size_t idx = 0; for (auto& iter : iters) { From 2730fe693edf306aad11a48491cfe3be4c178a47 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Fri, 21 Jun 2019 10:12:29 -0700 Subject: [PATCH 173/572] Fix ingested file and directory not being synced (#5435) Summary: It is not safe to assume the application has synced the SST file before ingesting it into the DB. Also, the directory the ingested file is placed in needs to be fsynced, otherwise the file can be lost. For the integrity of RocksDB we need to sync the ingested file and directory before applying the change to the manifest. Also, syncing after writing the global sequence number when write_global_seqno=true was removed in https://github.com/facebook/rocksdb/issues/4172. Adding it back. Fixes https://github.com/facebook/rocksdb/issues/5287. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5435 Test Plan: Test ingesting a file with the ldb command and observe fsync/fdatasync in strace output. Tried both move_files=true and move_files=false. https://gist.github.com/yiwu-arbug/650a4023f57979056d83485fa863bef9 More test suggestions are welcome. Differential Revision: D15941675 Pulled By: riversand963 fbshipit-source-id: 389533f3923065a96df2cdde23ff4724a1810d78 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 12 +---- db/db_impl/db_impl.h | 56 ++++++++++++--------- db/db_impl/db_impl_open.cc | 6 +-- db/external_sst_file_basic_test.cc | 56 +++++++++++++++++++++ db/external_sst_file_ingestion_job.cc | 67 +++++++++++++++++++++++-- db/external_sst_file_ingestion_job.h | 15 +++++- test_util/fault_injection_test_env.cc | 72 ++++++++++++++++++++++++++- test_util/fault_injection_test_env.h | 29 +++++++++++ 9 files changed, 270 insertions(+), 44 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 825c1def47c..975ece580d4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -29,6 +29,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Fix ingested file and directory not being fsynced. * Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST.
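To make the durability requirement above concrete, here is a minimal POSIX-style sketch of the pattern this patch implements through RocksDB's Env/Directory abstractions: sync the file's contents first, then fsync the parent directory so the new directory entry itself survives a crash. This is illustrative only; PersistNewFile is not a RocksDB API.

#include <fcntl.h>
#include <unistd.h>

// Durably persist a freshly linked/copied file: fsync the file, then fsync
// its parent directory so the directory entry is also on stable storage.
// Returns 0 on success, -1 on failure.
int PersistNewFile(const char* file_path, const char* dir_path) {
  int fd = open(file_path, O_RDONLY);
  if (fd < 0) return -1;
  int rc = fsync(fd);
  close(fd);
  if (rc != 0) return -1;
  int dfd = open(dir_path, O_RDONLY | O_DIRECTORY);
  if (dfd < 0) return -1;
  rc = fsync(dfd);
  close(dfd);
  return rc;
}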
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index f3fc96d8d1f..e2de696ef57 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -861,16 +861,6 @@ Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { return ret_dir; } -Directory* DBImpl::Directories::GetDataDir(size_t path_id) const { - assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); - if (ret_dir == nullptr) { - // Should use db_dir_ - return db_dir_.get(); - } - return ret_dir; -} - Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map<std::string, std::string>& options_map) { @@ -3644,7 +3634,7 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd(); ingestion_jobs.emplace_back(env_, versions_.get(), cfd, immutable_db_options_, env_options_, - &snapshots_, arg.options); + &snapshots_, arg.options, &directories_); } std::vector<std::pair<bool, Status>> exec_results; for (size_t i = 0; i != num_cfs; ++i) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index e6d5a56e244..b5437c49543 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -77,6 +77,38 @@ struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +// Class to maintain directories for all database paths other than main one. +class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector<std::string>& data_paths); + + Directory* GetDataDir(size_t path_id) const { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; + } + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr<Directory> db_dir_; + std::vector<std::unique_ptr<Directory>> data_dirs_; + std::unique_ptr<Directory> wal_dir_; +}; + // While DB is the public interface of RocksDB, and DBImpl is the actual // class implementing it. It's the entrance of the core RocksDB engine. // All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a @@ -1047,30 +1079,6 @@ class DBImpl : public DB { } }; - // Class to maintain directories for all database paths other than main one.
- class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector<std::string>& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr<Directory> db_dir_; - std::vector<std::unique_ptr<Directory>> data_dirs_; - std::unique_ptr<Directory> wal_dir_; - }; - struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} void AddSize(uint64_t new_size) { size += new_size; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index eec7cf16aa7..13d6959d474 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -265,9 +265,9 @@ Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, return env->NewDirectory(dirname, directory); } -Status DBImpl::Directories::SetDirectories( - Env* env, const std::string& dbname, const std::string& wal_dir, - const std::vector<std::string>& data_paths) { +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector<std::string>& data_paths) { Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { return s; diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 91a422bed9e..ff7da502afb 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,6 +9,7 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" +#include "test_util/fault_injection_test_env.h" #include "test_util/testutil.h" namespace rocksdb { @@ -20,6 +21,7 @@ class ExternalSSTFileBasicTest public: ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { sst_files_dir_ = dbname_ + "/sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); DestroyAndRecreateExternalSSTFilesDir(); } @@ -140,6 +142,7 @@ class ExternalSSTFileBasicTest protected: std::string sst_files_dir_; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_; }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -689,6 +692,59 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(ExternalSSTFileBasicTest, SyncFailure) { + Options options; + options.create_if_missing = true; + options.env = fault_injection_test_env_.get(); + + std::vector<std::pair<std::string, std::string>> test_cases = { + {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile", + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"}, + {"ExternalSstFileIngestionJob::BeforeSyncDir", + "ExternalSstFileIngestionJob::AfterSyncDir"}, + {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno", + "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; + + for (size_t i = 0; i < test_cases.size(); i++) { + SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(false); + }); + SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(true); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + if (i == 2) { + ASSERT_OK(Put("foo", "v1")); + } + + Options sst_file_writer_options; + std::unique_ptr<SstFileWriter> sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst";
ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + if (i == 0) { + ingest_opt.move_files = true; + } + const Snapshot* snapshot = db_->GetSnapshot(); + if (i == 2) { + ingest_opt.write_global_seqno = true; + } + ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); + } +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7e9657cc901..44b50168566 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,11 +7,13 @@ #include "db/external_sst_file_ingestion_job.h" -#include <cinttypes> #include <algorithm> +#include <cinttypes> #include <string> +#include <unordered_set> #include <vector> +#include "db/db_impl/db_impl.h" #include "db/version_edit.h" #include "file/file_util.h" #include "table/merging_iterator.h" @@ -86,6 +88,7 @@ Status ExternalSstFileIngestionJob::Prepare( } // Copy/Move external files into DB + std::unordered_set<size_t> ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); f.copy_file = false; @@ -95,8 +98,26 @@ f.fd.GetPathId()); if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported() && - ingestion_options_.failed_move_fall_back_to_copy) { + if (status.ok()) { + // It is unsafe to assume the application has synced the file and its + // directory before ingesting it. For the integrity of RocksDB we need + // to sync the file. + std::unique_ptr<WritableFile> file_to_sync; + status = env_->ReopenWritableFile(path_inside_db, &file_to_sync, + env_options_); + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } + } + } else if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; } @@ -107,6 +128,7 @@ Status ExternalSstFileIngestionJob::Prepare( if (f.copy_file) { TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", nullptr); + // CopyFile also syncs the new file. status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); } @@ -115,8 +137,25 @@ break; } f.internal_file_path = path_inside_db; + ingestion_path_ids.insert(f.fd.GetPathId()); + } + + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); + if (status.ok()) { + for (auto path_id : ingestion_path_ids) { + status = directories_->GetDataDir(path_id)->Fsync(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync directory %" ROCKSDB_PRIszt + " while ingest file: %s", + path_id, status.ToString().c_str()); + break; + } + } } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // TODO: The following is duplicated with Cleanup().
if (!status.ok()) { // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { @@ -559,6 +598,18 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( std::string seqno_val; PutFixed64(&seqno_val, seqno); status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(rwfile.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } if (!status.ok()) { return status; } @@ -599,6 +650,16 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } +template <typename TWritableFile> +Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { + assert(file != nullptr); + if (db_options_.use_fsync) { + return file->Fsync(); + } else { + return file->Sync(); + } +} + } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index baa8e9f0f64..50f3944054f 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -20,6 +20,8 @@ namespace rocksdb { +class Directories; + struct IngestedFileInfo { // External file path std::string external_file_path; @@ -77,7 +79,8 @@ class ExternalSstFileIngestionJob { Env* env, VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, - const IngestExternalFileOptions& ingestion_options) + const IngestExternalFileOptions& ingestion_options, + Directories* directories) : env_(env), versions_(versions), cfd_(cfd), db_options_(db_options), env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), + directories_(directories), job_start_time_(env_->NowMicros()), - consumed_seqno_(false) {} + consumed_seqno_(false) { + assert(directories != nullptr); + } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector<std::string>& external_files_paths, @@ -153,6 +159,10 @@ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, int level); + // Helper method to sync given file.
+ template <typename TWritableFile> + Status SyncIngestedFile(TWritableFile* file); + Env* env_; VersionSet* versions_; ColumnFamilyData* cfd_; @@ -161,6 +171,7 @@ SnapshotList* db_snapshots_; autovector<IngestedFileInfo> files_to_ingest_; const IngestExternalFileOptions& ingestion_options_; + Directories* directories_; VersionEdit edit_; uint64_t job_start_time_; bool consumed_seqno_; diff --git a/test_util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc index a591ff4b57b..5c47b7ea455 100644 --- a/test_util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -98,6 +98,9 @@ Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { } Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } env_->SyncDir(dirname_); return dir_->Fsync(); } @@ -158,6 +161,53 @@ Status TestWritableFile::Sync() { return Status::OK(); } +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr<RandomRWFile>&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + Status FaultInjectionTestEnv::NewDirectory(const std::string& name, std::unique_ptr<Directory>* result) { std::unique_ptr<Directory> r; @@ -220,6 +270,27 @@ Status FaultInjectionTestEnv::ReopenWritableFile( return s; } +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr<RandomRWFile>* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state.
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + Status FaultInjectionTestEnv::NewRandomAccessFile( const std::string& fname, std::unique_ptr<RandomAccessFile>* result, const EnvOptions& soptions) { @@ -238,7 +309,6 @@ Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), s.ToString().c_str()); } - assert(s.ok()); if (s.ok()) { UntrackFile(f); } diff --git a/test_util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h index d962acfd585..b68b3faedce 100644 --- a/test_util/fault_injection_test_env.h +++ b/test_util/fault_injection_test_env.h @@ -82,6 +82,31 @@ class TestWritableFile : public WritableFile { FaultInjectionTestEnv* env_; }; +// A wrapper around RandomRWFile that fails reads, writes, flushes and syncs +// once the fault-injection filesystem has been deactivated. +class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr<RandomRWFile> target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + class TestDirectory : public Directory { public: explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, @@ -114,6 +139,10 @@ class FaultInjectionTestEnv : public EnvWrapper { std::unique_ptr<WritableFile>* result, const EnvOptions& soptions) override; + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& soptions) override; + Status NewRandomAccessFile(const std::string& fname, std::unique_ptr<RandomAccessFile>* result, const EnvOptions& soptions) override; From 22028aa9ab27cf860b74d12e006f82ff551caee0 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 21 Jun 2019 21:07:09 -0700 Subject: [PATCH 174/572] Compaction Reads should read no more than compaction_readahead_size bytes, when set! (#5498) Summary: As a result of https://github.com/facebook/rocksdb/issues/5431, the compaction_readahead_size given by a user was not used exactly. The reason is that the code behind readahead for user reads and compaction reads was unified in the above PR, and the behavior for user reads is to read readahead_size + n bytes (see the FilePrefetchBuffer::TryReadFromCache method). Before the unification, the ReadaheadRandomAccessFileReader used compaction_readahead_size as-is. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5498 Test Plan: Ran strace command: strace -e pread64 -f -T -t ./db_compaction_test --gtest_filter=DBCompactionTest.PartialManualCompaction In the test the compaction_readahead_size was configured to 2MB and verified the pread syscall did indeed request 2MB. Before the change it was requesting more than 2MB. Strace Output: strace: Process 3798982 attached Note: Google Test filter = DBCompactionTest.PartialManualCompaction [==========] Running 1 test from 1 test case. [----------] Global test environment set-up.
[----------] 1 test from DBCompactionTest [ RUN ] DBCompactionTest.PartialManualCompaction strace: Process 3798983 attached strace: Process 3798984 attached strace: Process 3798985 attached strace: Process 3798986 attached strace: Process 3798987 attached strace: Process 3798992 attached [pid 3798987] 12:07:05 +++ exited with 0 +++ strace: Process 3798993 attached [pid 3798993] 12:07:05 +++ exited with 0 +++ strace: Process 3798994 attached strace: Process 3799008 attached strace: Process 3799009 attached [pid 3799008] 12:07:05 +++ exited with 0 +++ strace: Process 3799010 attached [pid 3799009] 12:07:05 +++ exited with 0 +++ strace: Process 3799011 attached [pid 3799010] 12:07:05 +++ exited with 0 +++ [pid 3799011] 12:07:05 +++ exited with 0 +++ strace: Process 3799012 attached [pid 3799012] 12:07:05 +++ exited with 0 +++ strace: Process 3799013 attached strace: Process 3799014 attached [pid 3799013] 12:07:05 +++ exited with 0 +++ strace: Process 3799015 attached [pid 3799014] 12:07:05 +++ exited with 0 +++ [pid 3799015] 12:07:05 +++ exited with 0 +++ strace: Process 3799016 attached [pid 3799016] 12:07:05 +++ exited with 0 +++ strace: Process 3799017 attached [pid 3799017] 12:07:05 +++ exited with 0 +++ strace: Process 3799019 attached [pid 3799019] 12:07:05 +++ exited with 0 +++ strace: Process 3799020 attached strace: Process 3799021 attached [pid 3799020] 12:07:05 +++ exited with 0 +++ [pid 3799021] 12:07:05 +++ exited with 0 +++ strace: Process 3799022 attached [pid 3799022] 12:07:05 +++ exited with 0 +++ strace: Process 3799023 attached [pid 3799023] 12:07:05 +++ exited with 0 +++ strace: Process 3799047 attached strace: Process 3799048 attached [pid 3799047] 12:07:06 +++ exited with 0 +++ [pid 3799048] 12:07:06 +++ exited with 0 +++ [pid 3798994] 12:07:06 +++ exited with 0 +++ strace: Process 3799052 attached [pid 3799052] 12:07:06 +++ exited with 0 +++ strace: Process 3799054 attached strace: Process 3799069 attached strace: Process 3799070 attached [pid 3799069] 12:07:06 +++ exited with 0 +++ strace: Process 3799071 attached [pid 3799070] 12:07:06 +++ exited with 0 +++ [pid 3799071] 12:07:06 +++ exited with 0 +++ strace: Process 3799072 attached strace: Process 3799073 attached [pid 3799072] 12:07:06 +++ exited with 0 +++ [pid 3799073] 12:07:06 +++ exited with 0 +++ strace: Process 3799074 attached [pid 3799074] 12:07:06 +++ exited with 0 +++ strace: Process 3799075 attached [pid 3799075] 12:07:06 +++ exited with 0 +++ strace: Process 3799076 attached [pid 3799076] 12:07:06 +++ exited with 0 +++ strace: Process 3799077 attached [pid 3799077] 12:07:06 +++ exited with 0 +++ strace: Process 3799078 attached [pid 3799078] 12:07:06 +++ exited with 0 +++ strace: Process 3799079 attached [pid 3799079] 12:07:06 +++ exited with 0 +++ strace: Process 3799080 attached [pid 3799080] 12:07:06 +++ exited with 0 +++ strace: Process 3799081 attached [pid 3799081] 12:07:06 +++ exited with 0 +++ strace: Process 3799082 attached [pid 3799082] 12:07:06 +++ exited with 0 +++ strace: Process 3799083 attached [pid 3799083] 12:07:06 +++ exited with 0 +++ strace: Process 3799086 attached strace: Process 3799087 attached [pid 3798984] 12:07:06 pread64(9, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000121> [pid 3798984] 12:07:06 pread64(9, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000106> [pid 3798984] 12:07:06 pread64(9, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000081> [pid 3798984] 12:07:06 pread64(9, 
"\0\v\3foo\2\7\0\0\0\0\0\0\0\270 \0\v\4foo\2\3\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000138> [pid 3798984] 12:07:06 pread64(11, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000097> [pid 3798984] 12:07:06 pread64(11, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086> [pid 3798984] 12:07:06 pread64(11, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000064> [pid 3798984] 12:07:06 pread64(11, "\0\v\3foo\2\21\0\0\0\0\0\0\0\270 \0\v\4foo\2\r\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000064> [pid 3798984] 12:07:06 pread64(12, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(12, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000090> [pid 3798984] 12:07:06 pread64(12, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000059> [pid 3798984] 12:07:06 pread64(12, "\0\v\3foo\2\33\0\0\0\0\0\0\0\270 \0\v\4foo\2\27\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065> [pid 3798984] 12:07:06 pread64(13, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000070> [pid 3798984] 12:07:06 pread64(13, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000059> [pid 3798984] 12:07:06 pread64(13, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000061> [pid 3798984] 12:07:06 pread64(13, "\0\v\3foo\2%\0\0\0\0\0\0\0\270 \0\v\4foo\2!\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065> [pid 3798984] 12:07:06 pread64(14, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000118> [pid 3798984] 12:07:06 pread64(14, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093> [pid 3798984] 12:07:06 pread64(14, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000050> [pid 3798984] 12:07:06 pread64(14, "\0\v\3foo\2/\0\0\0\0\0\0\0\270 \0\v\4foo\2+\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000082> [pid 3798984] 12:07:06 pread64(15, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(15, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086> [pid 3798984] 12:07:06 pread64(15, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000091> [pid 3798984] 12:07:06 pread64(15, "\0\v\3foo\0029\0\0\0\0\0\0\0\270 \0\v\4foo\0025\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000174> [pid 3798984] 12:07:06 pread64(16, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(16, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093> [pid 3798984] 12:07:06 pread64(16, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000194> [pid 3798984] 12:07:06 pread64(16, "\0\v\3foo\2C\0\0\0\0\0\0\0\270 \0\v\4foo\2?\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000086> [pid 3798984] 12:07:06 pread64(17, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000079> [pid 3798984] 12:07:06 pread64(17, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000047> [pid 3798984] 12:07:06 pread64(17, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000045> [pid 3798984] 12:07:06 pread64(17, "\0\v\3foo\2M\0\0\0\0\0\0\0\270 \0\v\4foo\2I\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000107> [pid 3798983] 12:07:06 pread64(17, 
"\0\v\200\10foo\2P\0\0\0\0\0\0)U?MSg_)j(roFn($e"..., 2097152, 0) = 11230 <0.000091> [pid 3798983] 12:07:06 pread64(17, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(16, "\0\v\200\10foo\2F\0\0\0\0\0\0k[h3%.OPH_^:\\S7T&"..., 2097152, 0) = 11230 <0.000083> [pid 3798983] 12:07:06 pread64(16, "", 2085922, 11230) = 0 <0.000078> [pid 3798983] 12:07:06 pread64(15, "\0\v\200\10foo\2<\0\0\0\0\0\0+qToi_c{*S+4:N(:"..., 2097152, 0) = 11230 <0.000095> [pid 3798983] 12:07:06 pread64(15, "", 2085922, 11230) = 0 <0.000067> [pid 3798983] 12:07:06 pread64(14, "\0\v\200\10foo\0022\0\0\0\0\0\0%hw%OMa\"}9I609Q!B"..., 2097152, 0) = 11230 <0.000111> [pid 3798983] 12:07:06 pread64(14, "", 2085922, 11230) = 0 <0.000093> [pid 3798983] 12:07:06 pread64(13, "\0\v\200\10foo\2(\0\0\0\0\0\0p}Y&mu^DcaSGb2&nP"..., 2097152, 0) = 11230 <0.000128> [pid 3798983] 12:07:06 pread64(13, "", 2085922, 11230) = 0 <0.000076> [pid 3798983] 12:07:06 pread64(12, "\0\v\200\10foo\2\36\0\0\0\0\0\0YIyW#]oSs^6VHfB<`"..., 2097152, 0) = 11230 <0.000092> [pid 3798983] 12:07:06 pread64(12, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(11, "\0\v\200\10foo\2\24\0\0\0\0\0\0mfF8Jel/*Zf :-#s("..., 2097152, 0) = 11230 <0.000088> [pid 3798983] 12:07:06 pread64(11, "", 2085922, 11230) = 0 <0.000067> [pid 3798983] 12:07:06 pread64(9, "\0\v\200\10foo\2\n\0\0\0\0\0\0\\X'cjiHX)D,RSj1X!"..., 2097152, 0) = 11230 <0.000115> [pid 3798983] 12:07:06 pread64(9, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(8, "\1\315\5 \36\30\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 754) = 53 <0.000098> [pid 3798983] 12:07:06 pread64(8, "\0\22\3rocksdb.properties;\215\5\0\0\0\0\1\0\0\0"..., 37, 717) = 37 <0.000064> [pid 3798983] 12:07:06 pread64(8, "\0$\4rocksdb.block.based.table.ind"..., 658, 59) = 658 <0.000074> [pid 3798983] 12:07:06 pread64(8, "\0\v\2foo\1\0\0\0\0\0\0\0\0\31\0\0\0\0\1\0\0\0\0\212\216\222P", 29, 30) = 29 <0.000064> [pid 3799086] 12:07:06 +++ exited with 0 +++ [pid 3799087] 12:07:06 +++ exited with 0 +++ [pid 3799054] 12:07:06 +++ exited with 0 +++ strace: Process 3799104 attached [pid 3799104] 12:07:06 +++ exited with 0 +++ [ OK ] DBCompactionTest.PartialManualCompaction (757 ms) [----------] 1 test from DBCompactionTest (758 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (759 ms total) [ PASSED ] 1 test. 
[pid 3798983] 12:07:06 +++ exited with 0 +++ [pid 3798984] 12:07:06 +++ exited with 0 +++ [pid 3798992] 12:07:06 +++ exited with 0 +++ [pid 3798986] 12:07:06 +++ exited with 0 +++ [pid 3798982] 12:07:06 +++ exited with 0 +++ [pid 3798985] 12:07:06 +++ exited with 0 +++ 12:07:06 +++ exited with 0 +++ Differential Revision: D15948422 Pulled By: vjnadimpalli fbshipit-source-id: 9b189d1e8675d290c7784e4b33e5d3b5761d2ac8 --- util/file_reader_writer.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 0af4c2098f1..bf88503339a 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -797,9 +797,12 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, if (readahead_size_ > 0) { assert(file_reader_ != nullptr); assert(max_readahead_size_ >= readahead_size_); - - Status s = - Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + Status s; + if (for_compaction) { + s = Prefetch(file_reader_, offset, readahead_size_, for_compaction); + } else { + s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + } if (!s.ok()) { return false; } From 68980df89cc67a553b589c0e9000cef9b60bd344 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 24 Jun 2019 10:38:02 -0700 Subject: [PATCH 175/572] Also build compression libraries on AppVeyor CI (#5226) Summary: This adds some compression dependencies to AppVeyor CI (those whose builds can be easily scripted on Windows, i.e. Snappy, LZ4, and ZStd). Let's see if the CI passes ;-) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5226 Differential Revision: D15967223 fbshipit-source-id: 0914c613ac358cbb248df75cdee8099e836828dc --- appveyor.yml | 62 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 9dae40af8f7..6bdb164e84e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,67 @@ version: 1.0.{build} + image: Visual Studio 2017 + +environment: + JAVA_HOME: C:\Program Files\Java\jdk1.8.0 + THIRDPARTY_HOME: $(APPVEYOR_BUILD_FOLDER)\thirdparty + SNAPPY_HOME: $(THIRDPARTY_HOME)\snappy-1.1.7 + SNAPPY_INCLUDE: $(SNAPPY_HOME);$(SNAPPY_HOME)\build + SNAPPY_LIB_DEBUG: $(SNAPPY_HOME)\build\Debug\snappy.lib + SNAPPY_LIB_RELEASE: $(SNAPPY_HOME)\build\Release\snappy.lib + LZ4_HOME: $(THIRDPARTY_HOME)\lz4-1.8.3 + LZ4_INCLUDE: $(LZ4_HOME)\lib + LZ4_LIB_DEBUG: $(LZ4_HOME)\visual\VS2010\bin\x64_Debug\liblz4_static.lib + LZ4_LIB_RELEASE: $(LZ4_HOME)\visual\VS2010\bin\x64_Release\liblz4_static.lib + ZSTD_HOME: $(THIRDPARTY_HOME)\zstd-1.4.0 + ZSTD_INCLUDE: $(ZSTD_HOME)\lib;$(ZSTD_HOME)\lib\dictBuilder + ZSTD_LIB_DEBUG: $(ZSTD_HOME)\build\VS2010\bin\x64_Debug\libzstd_static.lib + ZSTD_LIB_RELEASE: $(ZSTD_HOME)\build\VS2010\bin\x64_Release\libzstd_static.lib + +install: + - md %THIRDPARTY_HOME% + - echo "Building Snappy dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o snappy-1.1.7.zip https://github.com/google/snappy/archive/1.1.7.zip + - unzip snappy-1.1.7.zip + - cd snappy-1.1.7 + - mkdir build + - cd build + - cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. + - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building LZ4 dependency..." 
+ - cd %THIRDPARTY_HOME% + - curl -fsSL -o lz4-1.8.3.zip https://github.com/lz4/lz4/archive/v1.8.3.zip + - unzip lz4-1.8.3.zip + - cd lz4-1.8.3\visual\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD lz4.sln /upgrade + - msbuild lz4.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild lz4.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building ZStd dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o zstd-1.4.0.zip https://github.com/facebook/zstd/archive/v1.4.0.zip + - unzip zstd-1.4.0.zip + - cd zstd-1.4.0\build\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD zstd.sln /upgrade + - msbuild zstd.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild zstd.sln /p:Configuration=Release /p:Platform=x64 + before_build: -- md %APPVEYOR_BUILD_FOLDER%\build -- cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 -DJNI=1 .. -- cd .. + - md %APPVEYOR_BUILD_FOLDER%\build + - cd %APPVEYOR_BUILD_FOLDER%\build + - cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. + - cd .. build: project: build\rocksdb.sln parallel: true verbosity: normal + test: + test_script: -- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + +on_failure: + - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip From c92c58f84dff863ea0e41db2c31de3ae9d75a539 Mon Sep 17 00:00:00 2001 From: Jermy Li Date: Mon, 24 Jun 2019 11:32:45 -0700 Subject: [PATCH 176/572] JNI: Do not create 8M block cache for negative blockCacheSize values (#5465) Summary: As [BlockBasedTableConfig setBlockCacheSize()](https://github.com/facebook/rocksdb/blob/1966a7c055f6e182d627275051f5c09441aa922d/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java#L728) says, "If cacheSize is non-positive, then cache will not be used." But when we configure a negative number or 0, there is an unexpected result: the block cache becomes 8M. - Allow 0 as a valid size. When the block cache size is 0, an 8MB block cache is created, as that is the default C++ API behavior. Also updated the comment. - Set no_block_cache to true if a negative value is passed as the block cache size, so that no block cache will be created.
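In C++ terms, the intended mapping from the Java-side value is roughly the following sketch. ConfigureBlockCache is a hypothetical helper written for illustration; NewLRUCache, block_cache, and no_block_cache are the real RocksDB options touched by the JNI change shown below.

#include <rocksdb/cache.h>
#include <rocksdb/table.h>

// Sketch: translate the Java blockCacheSize into BlockBasedTableOptions.
void ConfigureBlockCache(rocksdb::BlockBasedTableOptions* options,
                         long long jblock_cache_size) {
  if (jblock_cache_size >= 0) {
    // Zero or positive: create an LRU block cache; per the summary above,
    // a size of 0 ends up with the default 8MB cache behavior.
    options->block_cache =
        rocksdb::NewLRUCache(static_cast<size_t>(jblock_cache_size));
  } else {
    // Negative: opt out of the block cache entirely.
    options->no_block_cache = true;
    options->block_cache = nullptr;
  }
}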
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5465 Differential Revision: D15968788 Pulled By: sagar0 fbshipit-source-id: ee02d6e95841c9e2c316a64bfdf192d46ff5638a --- java/rocksjni/table.cc | 5 ++++- java/src/main/java/org/rocksdb/BlockBasedTableConfig.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1ccc550ab62..a4504d917ab 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -85,7 +85,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( std::shared_ptr<rocksdb::Cache> *pCache = reinterpret_cast<std::shared_ptr<rocksdb::Cache> *>(jblock_cache_handle); options.block_cache = *pCache; - } else if (jblock_cache_size > 0) { + } else if (jblock_cache_size >= 0) { if (jblock_cache_num_shard_bits > 0) { options.block_cache = rocksdb::NewLRUCache( static_cast<size_t>(jblock_cache_size), @@ -94,6 +94,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache( static_cast<size_t>(jblock_cache_size)); } + } else { + options.no_block_cache = true; + options.block_cache = nullptr; + } } if (jpersistent_cache_handle > 0) { diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 4c88a0224c6..bf5c0c1a921 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -725,7 +725,7 @@ public long blockCacheSize() { /** * Set the size of the cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. + * If cacheSize is negative, then cache will not be used. * DEFAULT: 8M * * @param blockCacheSize block cache size in bytes From e731f4402258554812c46334dc0d9483e6cc769b Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Mon, 24 Jun 2019 16:08:17 -0700 Subject: [PATCH 177/572] C file should not include <cstddef>, it is a C++ header. (#5499) Summary: Include <stddef.h> instead. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5499 Differential Revision: D15966937 Pulled By: miasantreble fbshipit-source-id: 2156c4329b91d26d447de94f1231264d52786350 --- util/crc32c_ppc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index ce0b9f27ce6..888a4943eaa 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -6,7 +6,7 @@ // (found in the LICENSE.Apache file in the root directory). #define CRC_TABLE -#include <cstddef> +#include <stddef.h> #include <stdint.h> #include <strings.h> #include "util/crc32c_ppc_constants.h" From acb80534cac798d250ee85812f0e45112f2e4b66 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Mon, 24 Jun 2019 17:36:26 -0700 Subject: [PATCH 178/572] Fix build jemalloc api (#5470) Summary: There is a compile error on Windows with MSVC in malloc_stats.cc where malloc_stats_print is referenced. The compiler only knows je_malloc_stats_print from jemalloc.h. Adding JEMALLOC_NO_RENAME replaces malloc_stats_print with je_malloc_stats_print.
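As a rough illustration of the mechanism described above (paraphrased from the summary; the exact macro machinery lives in jemalloc's own headers, so treat the mapping below as an assumption, not jemalloc's literal code):

// With -DJEMALLOC_NO_RENAME in effect, jemalloc's headers keep the
// unprefixed public names usable by mapping them onto the exported
// je_-prefixed symbols, conceptually:
//
//   #define malloc_stats_print je_malloc_stats_print
//
// so the existing call site in malloc_stats.cc compiles and links on
// Windows without any source change.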
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5470 Differential Revision: D15978720 fbshipit-source-id: c05757a2e89e2e015a661d9626c352e4f32f97e4 --- thirdparty.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty.inc b/thirdparty.inc index ed9d4c0f8db..25ecdab88c2 100644 --- a/thirdparty.inc +++ b/thirdparty.inc @@ -241,7 +241,7 @@ endif() if (WITH_JEMALLOC) message(STATUS "JEMALLOC library is enabled") - set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= ") + set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= -DJEMALLOC_NO_RENAME") if(DEFINED ENV{JEMALLOC_INCLUDE}) set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE}) From 554a6456aad5b46149e05eab41779778c51607f4 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 24 Jun 2019 20:38:20 -0700 Subject: [PATCH 179/572] Block cache trace analysis: Write time series graphs in csv files (#5490) Summary: This PR adds a feature to the block cache trace analysis tool to write statistics into csv files. 1. The analysis tool supports grouping the number of accesses per second by various labels, e.g., block, column family, block type, or a combination of them. 2. It also computes reuse distance and reuse interval. Reuse distance: The cumulated size of unique blocks read between two consecutive accesses on the same block. Reuse interval: The time between two consecutive accesses on the same block. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5490 Differential Revision: D15901322 Pulled By: HaoyuHuang fbshipit-source-id: b5454fea408a32757a80be63de6fe1c8149ca70e --- tools/block_cache_trace_analyzer.cc | 484 ++++++++++++++++++++++- tools/block_cache_trace_analyzer.h | 58 ++- tools/block_cache_trace_analyzer_test.cc | 115 +++++- 3 files changed, 628 insertions(+), 29 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 732094bf29b..78753a21622 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "monitoring/histogram.h" #include "util/gflags_compat.h" @@ -42,12 +41,70 @@ DEFINE_bool(print_data_block_access_count_stats, false, DEFINE_int32(cache_sim_warmup_seconds, 0, "The number of seconds to warmup simulated caches. The hit/miss " "counters are reset after the warmup completes."); -DEFINE_string(output_miss_ratio_curve_path, "", - "The output file to save the computed miss ratios. File format: " - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"); +DEFINE_string( + block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results. It contains 1) an " + "mrc file that saves the computed miss ratios for simulated caches. Its " + "format is " + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several " + "\"label_access_timeline\" files that contain number of accesses per " + "second grouped by the label. File format: " + "time,label_1_access_per_second,label_2_access_per_second,...,label_N_" + "access_per_second where N is the number of unique labels found in the " + "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" " + "csv files that contain the reuse distance/interval grouped by label. File " + "format: bucket,label_1,label_2,...,label_N. The first N buckets are " + "absolute values.
The second N buckets are percentage values."); +DEFINE_string( + timeline_labels, "", + "Group the number of accesses per block per second using these labels. " + "Possible labels are a combination of the following: cf (column family), " + "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" " + "means the number of accesses per second is grouped by unique pairs of " + "\"cf_bt\". A label \"all\" contains the aggregated number of accesses per " + "second across all possible labels."); +DEFINE_string(reuse_distance_labels, "", + "Group the reuse distance of a block using these labels. Reuse " + "distance is defined as the cumulated size of unique blocks read " + "between two consecutive accesses on the same block."); +DEFINE_string( + reuse_distance_buckets, "", + "Group blocks by their reuse distances given these buckets. For " + "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M " + "and 1G, respectively. The last bucket contains the number of blocks with " + "reuse distance larger than 1G. "); +DEFINE_string( + reuse_interval_labels, "", + "Group the reuse interval of a block using these labels. Reuse " + "interval is defined as the time between two consecutive accesses " + "on the same block."); +DEFINE_string( + reuse_interval_buckets, "", + "Group blocks by their reuse interval given these buckets. For " + "example, if 'reuse_distance_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse interval less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. The last " + "bucket contains the number of blocks with reuse interval longer than 100 " + "seconds."); namespace rocksdb { namespace { const std::string kMissRatioCurveFileName = "mrc"; const std::string kGroupbyBlock = "block"; const std::string kGroupbyColumnFamily = "cf"; const std::string kGroupbySSTFile = "sst"; const std::string kGroupbyBlockType = "bt"; const std::string kGroupbyCaller = "caller"; const std::string kGroupbyLevel = "level"; const std::string kGroupbyAll = "all"; const std::set<std::string> kGroupbyLabels{ kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; std::string block_type_to_string(TraceType type) { switch (type) { case kBlockTraceFilterBlock: @@ -146,8 +203,9 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { trace_start_time_ = access.access_timestamp; } // access.access_timestamp is in microseconds.
- if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <= - access.access_timestamp) { + if (!warmup_complete_ && + trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= + access.access_timestamp) { for (auto& sim_cache : sim_caches_) { sim_cache->reset_counter(); } @@ -162,14 +220,16 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { } } -void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { +void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (!cache_simulator_) { return; } - if (output_miss_ratio_curve_path_.empty()) { + if (output_dir_.empty()) { return; } - std::ofstream out(output_miss_ratio_curve_path_); + const std::string output_miss_ratio_curve_path = + output_dir_ + "/" + kMissRatioCurveFileName; + std::ofstream out(output_miss_ratio_curve_path); if (!out.is_open()) { return; } @@ -203,14 +263,345 @@ void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { out.close(); } +std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr( + const std::string& label_str) const { + std::stringstream ss(label_str); + std::set<std::string> labels; + // label_str is in the form of "label1_label2_label3", e.g., cf_bt. + while (ss.good()) { + std::string label_name; + getline(ss, label_name, '_'); + if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) { + // Unknown label name. + fprintf(stderr, "Unknown label name %s, label string %s\n", + label_name.c_str(), label_str.c_str()); + return {}; + } + labels.insert(label_name); + } + return labels; +} + +std::string BlockCacheTraceAnalyzer::BuildLabel( + const std::set<std::string>& labels, const std::string& cf_name, + uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, + const std::string& block_key) const { + std::map<std::string, std::string> label_value_map; + label_value_map[kGroupbyAll] = kGroupbyAll; + label_value_map[kGroupbyLevel] = std::to_string(level); + label_value_map[kGroupbyCaller] = caller_to_string(caller); + label_value_map[kGroupbySSTFile] = std::to_string(fd); + label_value_map[kGroupbyBlockType] = block_type_to_string(type); + label_value_map[kGroupbyColumnFamily] = cf_name; + label_value_map[kGroupbyBlock] = block_key; + // Concatenate the label values. + std::string label; + for (auto const& l : labels) { + label += label_value_map[l]; + label += "-"; + } + if (!label.empty()) { + label.pop_back(); + } + return label; +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline( + const std::string& label_str) const { + std::set<std::string> labels = ParseLabelStr(label_str); + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block.
+ for (auto const& timeline : + block_access_info.second.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string& block_key = block_access_info.first; + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_key); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + } + } + } + } + } + } + + // We have label_access_timeline now. Write them into a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_access_timeline"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (auto const& label : label_access_timeline) { + header += ","; + header += label.first; + } + out << header << std::endl; + std::string row; + for (uint64_t now = start_time; now <= end_time; now++) { + row = std::to_string(now); + for (auto const& label : label_access_timeline) { + auto it = label.second.find(now); + row += ","; + if (it != label.second.end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteReuseDistance( + const std::string& label_str, + const std::set<uint64_t>& distance_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses; + uint64_t total_num_reuses = 0; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + const std::string& block_key = block_access_info.first; + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. + for (auto const& distance_bucket : distance_buckets) { + label_distance_num_reuses[label][distance_bucket] = 0; + } + } + for (auto const& reuse_distance : + block_access_info.second.reuse_distance_count) { + label_distance_num_reuses[label] + .upper_bound(reuse_distance.first) + ->second += reuse_distance.second; + total_num_reuses += reuse_distance.second; + } + } + } + } + } + + // We have label_naccesses and label_distance_num_reuses now. Write them into + // a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_reuse_distance"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_distance_num_reuses) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + // Absolute values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::set<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  const uint64_t prev_num = it->second;
+  it++;
+  // Reused within one second.
+  if (prev_num > 1) {
+    (*label_time_num_reuses)[label].upper_bound(1)->second += prev_num - 1;
+    *total_num_reuses += prev_num - 1;
+  }
+  while (it != timeline.end()) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    const uint64_t reuse_interval = timestamp - prev_timestamp;
+    (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += num;
+    *total_num_reuses += num;
+    prev_timestamp = timestamp;
+    it++;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::set<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses;
+  uint64_t total_num_reuses = 0;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          const std::string& block_key = block_access_info.first;
+          if (labels.find(kGroupbyCaller) != labels.end()) {
+            for (auto const& timeline :
+                 block_access_info.second.caller_num_accesses_timeline) {
+              const TableReaderCaller caller = timeline.first;
+              const std::string label = BuildLabel(labels, cf_name, fd, level,
+                                                   type, caller, block_key);
+              UpdateReuseIntervalStats(label, time_buckets, timeline.second,
+                                       &label_time_num_reuses,
+                                       &total_num_reuses);
+            }
+            continue;
+          }
+          // Does not group by caller so we need to flatten the access timeline.
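+          // kMaxBlockCacheLookupCaller acts as a placeholder caller value
+          // here; BuildLabel ignores it unless grouping by caller.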
+          const std::string label = BuildLabel(
+              labels, cf_name, fd, level, type,
+              TableReaderCaller::kMaxBlockCacheLookupCaller, block_key);
+          std::map<uint64_t, uint64_t> timeline;
+          for (auto const& caller_timeline :
+               block_access_info.second.caller_num_accesses_timeline) {
+            for (auto const& time_naccess : caller_timeline.second) {
+              timeline[time_naccess.first] += time_naccess.second;
+            }
+          }
+          UpdateReuseIntervalStats(label, time_buckets, timeline,
+                                   &label_time_num_reuses, &total_num_reuses);
+        }
+      }
+    }
+  }
+
+  // We have label_time_num_reuses now. Write it into a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_interval";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_time_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  // Absolute values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
 BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(
-    const std::string& trace_file_path,
-    const std::string& output_miss_ratio_curve_path,
+    const std::string& trace_file_path, const std::string& output_dir,
     std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)
-    : trace_file_path_(trace_file_path),
-      output_miss_ratio_curve_path_(output_miss_ratio_curve_path),
-      cache_simulator_(std::move(cache_simulator)) {
-  env_ = rocksdb::Env::Default();
+    : env_(rocksdb::Env::Default()),
+      trace_file_path_(trace_file_path),
+      output_dir_(output_dir),
+      cache_simulator_(std::move(cache_simulator)) {}
+
+void BlockCacheTraceAnalyzer::ComputeReuseDistance(
+    BlockAccessInfo* info) const {
+  assert(info);
+  if (info->num_accesses == 0) {
+    return;
+  }
+  uint64_t reuse_distance = 0;
+  for (auto const& block_key : info->unique_blocks_since_last_access) {
+    auto const& it = block_info_map_.find(block_key);
+    // This block must exist.
+    assert(it != block_info_map_.end());
+    reuse_distance += it->second->block_size;
+  }
+  info->reuse_distance_count[reuse_distance] += 1;
+  // Clear the set so that the next access to this block computes a fresh
+  // reuse distance.
+  info->unique_blocks_since_last_access.clear();
 }
 
 void BlockCacheTraceAnalyzer::RecordAccess(
@@ -223,7 +614,23 @@ void BlockCacheTraceAnalyzer::RecordAccess(
       file_aggr.block_type_aggregates_map[access.block_type];
   BlockAccessInfo& block_access_info =
       block_type_aggr.block_access_info_map[access.block_key];
+  ComputeReuseDistance(&block_access_info);
   block_access_info.AddAccess(access);
+  block_info_map_[access.block_key] = &block_access_info;
+
+  // Add this block to all existing blocks.
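+  // Inserting this access's block key into every block's unique-blocks set
+  // lets ComputeReuseDistance sum the sizes of distinct blocks read before
+  // each block's next access.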
+ for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } + } + } + } } Status BlockCacheTraceAnalyzer::Analyze() { @@ -659,6 +1066,18 @@ std::vector parse_cache_config_file( return configs; } +std::set parse_buckets(const std::string& bucket_str) { + std::set buckets; + std::stringstream ss(bucket_str); + while (ss.good()) { + std::string bucket; + getline(ss, bucket, ','); + buckets.insert(ParseUint64(bucket)); + } + buckets.insert(port::kMaxUint64); + return buckets; +} + int block_cache_trace_analyzer_tool(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_block_cache_trace_path.empty()) { @@ -678,7 +1097,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { warmup_seconds, downsample_ratio, cache_configs)); } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, - FLAGS_output_miss_ratio_curve_path, + FLAGS_block_cache_analysis_result_dir, std::move(cache_simulator)); Status s = analyzer.Analyze(); if (!s.IsIncomplete()) { @@ -701,7 +1120,38 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.PrintDataBlockAccessStats(); } print_break_lines(/*num_break_lines=*/3); - analyzer.PrintMissRatioCurves(); + analyzer.WriteMissRatioCurves(); + + if (!FLAGS_timeline_labels.empty()) { + std::stringstream ss(FLAGS_timeline_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteAccessTimeline(label); + } + } + + if (!FLAGS_reuse_distance_labels.empty() && + !FLAGS_reuse_distance_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::stringstream ss(FLAGS_reuse_distance_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseDistance(label, buckets); + } + } + + if (!FLAGS_reuse_interval_labels.empty() && + !FLAGS_reuse_interval_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::stringstream ss(FLAGS_reuse_interval_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseInterval(label, buckets); + } + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index c953ecf2164..21a99f7db76 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include "rocksdb/env.h" @@ -14,6 +15,8 @@ namespace rocksdb { +const uint64_t kMicrosInSecond = 1000000; + class BlockCacheTraceAnalyzer; // A cache configuration provided by user. @@ -73,6 +76,14 @@ struct BlockAccessInfo { non_exist_key_num_access_map; // for keys do not exist in this block. uint64_t num_referenced_key_exist_in_block = 0; std::map caller_num_access_map; + // caller:timestamp:number_of_accesses. The granularity of the timestamp is + // seconds. + std::map> + caller_num_accesses_timeline; + // Unique blocks since the last access. + std::set unique_blocks_since_last_access; + // Number of reuses grouped by reuse distance. 
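+  // The distance is in bytes: the cumulative size of unique blocks read
+  // between two consecutive accesses to this block.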
+  std::map<uint64_t, uint64_t> reuse_distance_count;
 
   void AddAccess(const BlockCacheTraceRecord& access) {
     if (first_access_time == 0) {
@@ -82,10 +93,13 @@ struct BlockAccessInfo {
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
     num_accesses++;
+    // access.access_timestamp is in microseconds.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
     if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type,
                                                         access.caller)) {
       num_keys = access.num_keys_in_block;
-
       if (access.referenced_key_exist_in_block == Boolean::kTrue) {
         key_num_access_map[access.referenced_key]++;
         num_referenced_key_exist_in_block++;
@@ -115,8 +129,7 @@ struct ColumnFamilyAccessInfoAggregate {
 class BlockCacheTraceAnalyzer {
  public:
   BlockCacheTraceAnalyzer(
-      const std::string& trace_file_path,
-      const std::string& output_miss_ratio_curve_path,
+      const std::string& trace_file_path, const std::string& output_dir,
       std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
   ~BlockCacheTraceAnalyzer() = default;
   // No copy and move.
@@ -165,7 +178,24 @@ class BlockCacheTraceAnalyzer {
   // accesses on keys exist in a data block and its break down by column family.
   void PrintDataBlockAccessStats() const;
 
-  void PrintMissRatioCurves() const;
+  // Write miss ratio curves of simulated cache configurations into a csv file
+  // saved in 'output_dir'.
+  void WriteMissRatioCurves() const;
+
+  // Write the access timeline into a csv file saved in 'output_dir'.
+  void WriteAccessTimeline(const std::string& label) const;
+
+  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+  // distance is defined as the cumulative size of unique blocks read between
+  // two consecutive accesses on the same block.
+  void WriteReuseDistance(const std::string& label_str,
+                          const std::set<uint64_t>& distance_buckets) const;
+
+  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+  // interval is defined as the time between two consecutive accesses on the
+  // same block.
+ void WriteReuseInterval(const std::string& label_str, + const std::set& time_buckets) const; const std::map& TEST_cf_aggregates_map() const { @@ -173,15 +203,33 @@ class BlockCacheTraceAnalyzer { } private: + std::set ParseLabelStr(const std::string& label_str) const; + + std::string BuildLabel(const std::set& labels, + const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + TableReaderCaller caller, + const std::string& block_key) const; + + void ComputeReuseDistance(BlockAccessInfo* info) const; + void RecordAccess(const BlockCacheTraceRecord& access); + void UpdateReuseIntervalStats( + const std::string& label, const std::set& time_buckets, + const std::map timeline, + std::map>* + label_time_num_reuses, + uint64_t* total_num_reuses) const; + rocksdb::Env* env_; const std::string trace_file_path_; - const std::string output_miss_ratio_curve_path_; + const std::string output_dir_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; + std::map block_info_map_; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index c361ba054ac..80734565a3d 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -49,7 +49,13 @@ class BlockCacheTracerTest : public testing::Test { EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; - output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve"; + timeline_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_buckets_ = "1,1K,1M,1G"; + reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_interval_buckets_ = "1,10,100,1000"; } ~BlockCacheTracerTest() override { @@ -85,11 +91,12 @@ class BlockCacheTracerTest : public testing::Test { assert(writer); for (uint32_t i = 0; i < nblocks; i++) { uint32_t key_id = from_key_id + i; + uint32_t timestamp = (key_id + 1) * kMicrosInSecond; BlockCacheTraceRecord record; record.block_type = block_type; record.block_size = kBlockSize + key_id; record.block_key = kBlockKeyPrefix + std::to_string(key_id); - record.access_timestamp = env_->NowMicros(); + record.access_timestamp = timestamp; record.cf_id = kCFId; record.cf_name = kDefaultColumnFamilyName; record.caller = GetCaller(key_id); @@ -146,11 +153,17 @@ class BlockCacheTracerTest : public testing::Test { "./block_cache_trace_analyzer", "-block_cache_trace_path=" + trace_file_path_, "-block_cache_sim_config_path=" + block_cache_sim_config_path_, - "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_, + "-block_cache_analysis_result_dir=" + test_path_, "-print_block_size_stats", "-print_access_count_stats", "-print_data_block_access_count_stats", - "-cache_sim_warmup_seconds=0"}; + "-cache_sim_warmup_seconds=0", + "-timeline_labels=" + timeline_labels_, + "-reuse_distance_labels=" + reuse_distance_labels_, + "-reuse_distance_buckets=" + reuse_distance_buckets_, + "-reuse_interval_labels=" + reuse_interval_labels_, + "-reuse_interval_buckets=" + reuse_interval_buckets_, + }; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -168,10 +181,14 @@ class BlockCacheTracerTest : public testing::Test { Env* env_; EnvOptions env_options_; - 
std::string output_miss_ratio_curve_path_; std::string block_cache_sim_config_path_; std::string trace_file_path_; std::string test_path_; + std::string timeline_labels_; + std::string reuse_distance_labels_; + std::string reuse_distance_buckets_; + std::string reuse_interval_labels_; + std::string reuse_interval_buckets_; }; TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { @@ -199,7 +216,8 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { // Validate the cache miss ratios. const std::vector expected_capacities{1024, 1024 * 1024, 1024 * 1024 * 1024}; - std::ifstream infile(output_miss_ratio_curve_path_); + const std::string mrc_path = test_path_ + "/mrc"; + std::ifstream infile(mrc_path); uint32_t config_index = 0; std::string line; // Read header. @@ -224,8 +242,91 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } ASSERT_EQ(expected_capacities.size(), config_index); infile.close(); + ASSERT_OK(env_->DeleteFile(mrc_path)); + } + { + // Validate the timeline csv files. + const uint32_t expected_num_lines = 50; + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string timeline_file = + test_path_ + "/" + l + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + uint32_t nlines = 0; + ASSERT_TRUE(getline(infile, line)); + uint64_t expected_time = 1; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + uint32_t naccesses = 0; + std::string substr; + uint32_t time = 0; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (time == 0) { + time = ParseUint32(substr); + continue; + } + naccesses += ParseUint32(substr); + } + nlines++; + ASSERT_EQ(1, naccesses); + ASSERT_EQ(expected_time, time); + expected_time += 1; + } + ASSERT_EQ(expected_num_lines, nlines); + ASSERT_OK(env_->DeleteFile(timeline_file)); + } + } + { + // Validate the reuse_interval and reuse_distance csv files. 
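+    // Each entry maps an output file suffix to the comma-separated label
+    // list whose csv files should be validated and then deleted.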
+ std::map test_reuse_csv_files; + test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_; + for (auto const& test : test_reuse_csv_files) { + const std::string& file_suffix = test.first; + const std::string& labels = test.second; + const uint32_t expected_num_rows = 10; + const uint32_t expected_num_rows_absolute_values = 5; + const uint32_t expected_reused_blocks = 0; + std::stringstream ss(labels); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix; + std::ifstream infile(reuse_csv_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + uint32_t nblocks = 0; + double npercentage = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + bool label_read = false; + nrows++; + while (ss_naccess.good()) { + std::string substr; + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!label_read) { + label_read = true; + continue; + } + if (nrows < expected_num_rows_absolute_values) { + nblocks += ParseUint32(substr); + } else { + npercentage += ParseDouble(substr); + } + } + } + ASSERT_EQ(expected_num_rows, nrows); + ASSERT_EQ(expected_reused_blocks, nblocks); + ASSERT_LT(npercentage, 0); + ASSERT_OK(env_->DeleteFile(reuse_csv_file)); + } + } } - ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_)); ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); } From b4d72094280e1e0220ec321779902aba6662db25 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 24 Jun 2019 20:50:35 -0700 Subject: [PATCH 180/572] Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). 
But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a --- HISTORY.md | 1 + db/db_iterator_test.cc | 142 ++++ include/rocksdb/table.h | 24 +- java/rocksjni/portal.h | 7 +- options/options_helper.cc | 4 +- table/block_based/block.cc | 77 ++- table/block_based/block.h | 103 +-- table/block_based/block_based_table_reader.cc | 653 ++++++++++-------- table/block_based/block_based_table_reader.h | 107 ++- table/block_based/block_test.cc | 253 +++---- .../block_based/data_block_hash_index_test.cc | 8 +- table/block_based/index_builder.cc | 10 +- table/block_based/index_builder.h | 48 +- table/block_based/partitioned_filter_block.cc | 25 +- table/block_fetcher.cc | 1 - table/format.cc | 52 ++ table/format.h | 29 + table/internal_iterator.h | 7 +- table/iterator.cc | 8 +- table/meta_blocks.cc | 17 +- table/table_test.cc | 319 ++++++++- table/two_level_iterator.cc | 26 +- table/two_level_iterator.h | 7 +- test_util/testutil.cc | 9 +- util/coding.h | 13 + 25 files changed, 1362 insertions(+), 588 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 975ece580d4..07eb2759736 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -41,6 +41,7 @@ * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. * When reading from option file/string/map, customized envs can be filled according to object registry. * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator. +* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index e2b9f503ffb..d514e7683de 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -1049,6 +1049,148 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { ASSERT_EQ(upper_bound_hits, 1); } } + +// Enable kBinarySearchWithFirstKey, do some iterator operations and check that +// they don't do unnecessary block reads. 
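+// The block cache is sized to hold all blocks, so BLOCK_CACHE_DATA_MISS
+// counts distinct block reads and BLOCK_CACHE_DATA_HIT counts re-reads.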
+TEST_P(DBIteratorTest, IndexWithFirstKey) { + for (int tailing = 0; tailing < 2; ++tailing) { + SCOPED_TRACE("tailing = " + std::to_string(tailing)); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a1", "x1")); + ASSERT_OK(Merge("b1", "y1")); + ASSERT_OK(Merge("c0", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a2", "x2")); + ASSERT_OK(Merge("b2", "y2")); + ASSERT_OK(Merge("c0", "z2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a3", "x3")); + ASSERT_OK(Merge("b3", "y3")); + ASSERT_OK(Merge("c3", "z3")); + ASSERT_OK(Flush()); + + // Block cache is not important for this test. + // We use BLOCK_CACHE_DATA_* counters just because they're the most readily + // available way of counting block accesses. + + ReadOptions ropt; + ropt.tailing = tailing; + std::unique_ptr iter(NewIterator(ropt)); + + iter->Seek("b10"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b3", iter->key().ToString()); + EXPECT_EQ("y3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Seek("c0"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c0", iter->key().ToString()); + EXPECT_EQ("z1,z2", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c3", iter->key().ToString()); + EXPECT_EQ("z3", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter.reset(); + + // Enable iterate_upper_bound and check that iterator is not trying to read + // blocks that are fully above upper bound. 
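+    // The iterator should detect from the index alone that every block at or
+    // after "b3" is out of bounds, so the miss counter must stay unchanged.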
+ std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 88fcc78ed8c..929239100a4 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -93,14 +93,32 @@ struct BlockBasedTableOptions { enum IndexType : char { // A space efficient index block that is optimized for // binary-search-based index. - kBinarySearch, + kBinarySearch = 0x00, // The hash index, if enabled, will do the hash lookup when // `Options.prefix_extractor` is provided. - kHashSearch, + kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. - kTwoLevelIndexSearch, + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. 
+ // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, }; IndexType index_type = kBinarySearch; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index d1585fcfa80..667af809bdc 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5902,8 +5902,10 @@ class IndexTypeJni { return 0x0; case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: return 0x1; - case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: return 0x2; + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey: + return 0x3; default: return 0x7F; // undefined } @@ -5920,6 +5922,9 @@ class IndexTypeJni { return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; case 0x2: return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + case 0x3: + return rocksdb::BlockBasedTableOptions::IndexType:: + kBinarySearchWithFirstKey; default: // undefined/default return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; diff --git a/options/options_helper.cc b/options/options_helper.cc index 71a7f9b2fc0..47aba7ad035 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1671,7 +1671,9 @@ std::unordered_map {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; std::unordered_map OptionsHelper::block_base_table_data_block_index_type_string_map = { diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 6c7e46d5969..8fa3ff9b986 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -608,8 +608,7 @@ bool IndexBlockIter::ParseNextIndexKey() { } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed - if (value_delta_encoded_) { - assert(value_length == 0); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { DecodeCurrentValue(shared); } return true; @@ -627,24 +626,32 @@ bool IndexBlockIter::ParseNextIndexKey() { // Otherwise the format is delta-size = block handle size - size of last block // handle. 
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { - assert(value_delta_encoded_); - const char* limit = data_ + restarts_; - if (shared == 0) { - uint64_t o, s; - const char* newp = GetVarint64Ptr(value_.data(), limit, &o); - assert(newp); - newp = GetVarint64Ptr(newp, limit, &s); - assert(newp); - decoded_value_ = BlockHandle(o, s); - value_ = Slice(value_.data(), newp - value_.data()); - } else { - uint64_t next_value_base = - decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; - int64_t delta; - const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); - decoded_value_ = - BlockHandle(next_value_base, decoded_value_.size() + delta); - value_ = Slice(value_.data(), newp - value_.data()); + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); } } @@ -875,14 +882,10 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -template <> -DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, - DataBlockIter* iter, Statistics* stats, - bool /*total_order_seek*/, - bool /*key_includes_seq*/, - bool /*value_is_full*/, - bool block_contents_pinned, - BlockPrefixIndex* /*prefix_index*/) { +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -913,13 +916,11 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, return ret_iter; } -template <> -IndexBlockIter* Block::NewIterator(const Comparator* cmp, - const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -938,9 +939,9 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, value_is_full, - block_contents_pinned, - nullptr /* data_block_hash_index */); + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); } return ret_iter; diff --git a/table/block_based/block.h b/table/block_based/block.h index 2bb577d33bd..3af92b6a262 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -165,17 +165,7 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // - // key_includes_seq, default true, means that the keys are in internal key - // format. - // value_is_full, default true, means that no delta encoding is - // applied to values. - // - // NewIterator - // Same as above but also updates read_amp_bitmap_ if it is not nullptr. - // - // NewIterator - // If `prefix_index` is not nullptr this block will do hash lookup for the key - // prefix. If total_order_seek is true, prefix_index_ is ignored. + // Updates read_amp_bitmap_ if it is not nullptr. // // If `block_contents_pinned` is true, the caller will guarantee that when // the cleanup functions are transferred from the iterator to other @@ -188,13 +178,32 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - template - TBlockIter* NewIterator( - const Comparator* comparator, const Comparator* user_comparator, - TBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -471,7 +480,7 @@ class DataBlockIter final : public BlockIter { bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -483,23 +492,12 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. 
- IndexBlockIter(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) - : IndexBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned, - value_is_full, nullptr /* data_block_hash_index */); - } - void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - DataBlockHashIndex* /*data_block_hash_index*/) { + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { InitializeBase(key_includes_seq ? comparator : user_comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); @@ -507,6 +505,12 @@ class IndexBlockIter final : public BlockIter { key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } } Slice user_key() const override { @@ -516,16 +520,17 @@ class IndexBlockIter final : public BlockIter { return key(); } - virtual BlockHandle value() const override { + virtual IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { return decoded_value_; } else { - BlockHandle handle; + IndexValue entry; Slice v = value_; - Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); assert(decode_s.ok()); - return handle; + return entry; } } @@ -552,10 +557,15 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + private: // Key is in InternalKey format bool key_includes_seq_; bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; // Whether the value is delta encoded. In that case the value is assumed to be // BlockHandle. The first value in each restart interval is the full encoded @@ -563,7 +573,22 @@ class IndexBlockIter final : public BlockIter { // offset of delta encoded BlockHandles is computed by adding the size of // previous delta encoded values in the same restart interval to the offset of // the first value in that restart interval. - BlockHandle decoded_value_; + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. 
+ IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 5b2f515006f..5344625ec94 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -191,24 +191,22 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return &table_->get_rep()->internal_comparator; } - bool index_key_includes_seq() const { + bool index_has_first_key() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_key_is_user_key; + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; } bool index_value_is_full() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); - - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_value_is_delta_encoded; + return table_->get_rep()->index_value_is_full; } Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, @@ -305,7 +303,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -319,10 +317,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } - InternalIteratorBase* it = nullptr; + InternalIteratorBase* it = nullptr; Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index @@ -330,26 +328,24 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. it = NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState( - table(), &partition_map_, index_key_includes_seq(), - index_value_is_full()), - index_block.GetValue()->NewIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full())); + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- it = new BlockBasedTableIterator( + it = new BlockBasedTableIterator( table(), ro, *internal_comparator(), - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full()), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - index_key_includes_seq(), index_value_is_full(), lookup_context ? lookup_context->caller : TableReaderCaller::kUncategorized); } @@ -368,7 +364,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - auto rep = table()->rep_; + const BlockBasedTable::Rep* rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; @@ -386,9 +382,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), &biter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -396,7 +393,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -405,7 +402,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; @@ -418,7 +415,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { biter.SeekToFirst(); auto ro = ReadOptions(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; CachableEntry block; // TODO: Support counter batch update for partitioned index and // filter blocks @@ -493,7 +490,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -507,15 +504,16 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); assert(it != nullptr); index_block.TransferTo(it); @@ -552,7 +550,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { assert(index_reader != nullptr); assert(!pin || prefetch); - auto rep = table->get_rep(); + const BlockBasedTable::Rep* rep = table->get_rep(); assert(rep != nullptr); CachableEntry index_block; @@ -636,7 +634,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -650,7 +648,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; @@ -658,11 +656,11 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { read_options.total_order_seek || disable_prefix_seek; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, total_order_seek, index_key_includes_seq(), - index_value_is_full(), false /* block_contents_pinned */, - prefix_index_.get()); + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); assert(it != nullptr); index_block.TransferTo(it); @@ -1083,7 +1081,6 @@ Status BlockBasedTable::Open( immortal_table); rep->file = std::move(file); rep->footer = footer; - rep->index_type = table_options.index_type; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. @@ -1113,6 +1110,8 @@ Status BlockBasedTable::Open( return s; } + // Populates table_properties and some fields that depend on it, + // such as index_type. s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(), largest_seqno); if (!s.ok()) { @@ -1317,6 +1316,24 @@ Status BlockBasedTable::ReadPropertiesBlock( BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.info_log); + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. 
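+  // The index type is persisted by the table builder as a fixed32-encoded
+  // entry in the user-collected properties.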
+ auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { @@ -1344,7 +1361,6 @@ Status BlockBasedTable::ReadRangeDelBlock( std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); assert(iter != nullptr); s = iter->status(); @@ -1436,7 +1452,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( &rep_->compression_dict_handle); } - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; @@ -1602,8 +1618,8 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, *meta_block = std::move(meta); // meta block uses bytewise comparator. - iter->reset(meta_block->get()->NewIterator( - BytewiseComparator(), BytewiseComparator())); + iter->reset(meta_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return Status::OK(); } @@ -1846,10 +1862,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->prefix_filtering ? prefix_extractor : nullptr, rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->index_key_includes_seq, rep_->index_value_is_full); } case Rep::FilterType::kBlockFilter: @@ -2055,7 +2068,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIteratorBase* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { @@ -2076,8 +2089,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - BlockType block_type, bool key_includes_seq, bool index_key_is_full, - GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -2106,7 +2119,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } assert(block.GetValue() != nullptr); - constexpr bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator // is destroyed as long as cleanup functions are moved to another object, // when: @@ -2117,10 +2130,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool block_contents_pinned 
= block.IsCached() || (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = block.GetValue()->NewIterator( - &rep_->internal_comparator, rep_->internal_comparator.user_comparator(), - iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); if (!block.IsCached()) { if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { @@ -2162,6 +2173,26 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, DataBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewDataIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, IndexBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, /* total_order_seek */ true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full, block_contents_pinned); +} + Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, @@ -2360,14 +2391,10 @@ Status BlockBasedTable::RetrieveBlock( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - bool index_key_includes_seq, bool index_key_is_full) - : table_(table), - block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq), - index_key_is_full_(index_key_is_full) {} - -InternalIteratorBase* + std::unordered_map>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( const BlockHandle& handle) { // Return a block iterator on the index partition @@ -2375,15 +2402,16 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // This is a possible scenario since block cache might not have had space // for the partition if (block != block_map_->end()) { - auto rep = table_->get_rep(); + const Rep* rep = table_->get_rep(); assert(rep); Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return block->second.GetValue()->NewIterator( + return block->second.GetValue()->NewIndexIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); } // Create an empty iterator return new IndexBlockIter(); @@ -2459,10 +2487,10 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( no_io_read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*need_upper_bound_check=*/nullptr, lookup_context)); + /*get_context=*/nullptr, lookup_context)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -2471,10 +2499,8 @@ bool BlockBasedTable::PrefixMayMatch( // and we're not really sure that we're past the end // of the file may_match = iiter->status().IsIncomplete(); - } else if ((rep_->table_properties && - rep_->table_properties->index_key_is_user_key - ? iiter->key() - : ExtractUserKey(iiter->key())) + } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key()) + : iiter->key()) .starts_with(ExtractUserKey(internal_prefix))) { // we need to check for this subtle case because our only // guarantee is that "the key is a string >= last key in that data @@ -2493,7 +2519,7 @@ bool BlockBasedTable::PrefixMayMatch( // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. - BlockHandle handle = iiter->value(); + BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, /*const_key_ptr=*/nullptr, lookup_context); @@ -2514,8 +2540,20 @@ bool BlockBasedTable::PrefixMayMatch( template void BlockBasedTableIterator::Seek(const Slice& target) { + SeekImpl(&target); +} + +template +void BlockBasedTableIterator::SeekToFirst() { + SeekImpl(nullptr); +} + +template +void BlockBasedTableIterator::SeekImpl( + const Slice* target) { is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target)) { ResetDataIter(); return; } @@ -2523,47 +2561,82 @@ void BlockBasedTableIterator::Seek(const Slice& target) { bool need_seek_index = true; if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. - prev_index_value_ = index_iter_->value(); - // We can avoid an index seek if: - // 1. The new seek key is larger than the current key - // 2. The new seek key is within the upper bound of the block - // Since we don't necessarily know the internal key for either - // the current key or the upper bound, we check user keys and - // exclude the equality case. Considering internal keys can - // improve for the boundary cases, but it would complicate the - // code. - if (user_comparator_.Compare(ExtractUserKey(target), - block_iter_.user_key()) > 0 && - user_comparator_.Compare(ExtractUserKey(target), - index_iter_->user_key()) < 0) { - need_seek_index = false; + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. 
The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } } } if (need_seek_index) { - index_iter_->Seek(target); + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + if (!index_iter_->Valid()) { ResetDataIter(); return; } - InitDataBlock(); } - block_iter_.Seek(target); + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } - FindKeyForward(); CheckOutOfBound(); - assert( - !block_iter_.Valid() || - (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || - (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), - block_iter_.key()) <= 0)); + + if (target) { + assert( + !Valid() || + ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? 
(user_comparator_.Compare(ExtractUserKey(*target), key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); + } } template void BlockBasedTableIterator::SeekForPrev( const Slice& target) { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; @@ -2587,10 +2660,14 @@ void BlockBasedTableIterator::SeekForPrev( index_iter_->Seek(target); if (!index_iter_->Valid()) { + if (!index_iter_->status().ok()) { + ResetDataIter(); + return; + } + index_iter_->SeekToLast(); if (!index_iter_->Valid()) { ResetDataIter(); - block_iter_points_to_real_block_ = false; return; } } @@ -2604,24 +2681,10 @@ void BlockBasedTableIterator::SeekForPrev( icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToFirst(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToFirst(); - FindKeyForward(); - CheckOutOfBound(); -} - template void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); if (!index_iter_->Valid()) { @@ -2635,9 +2698,13 @@ void BlockBasedTableIterator::SeekToLast() { template void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); + CheckOutOfBound(); } template @@ -2653,8 +2720,21 @@ bool BlockBasedTableIterator::NextAndGetResult( template void BlockBasedTableIterator::Prev() { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + FindKeyBackward(); } @@ -2667,9 +2747,9 @@ const size_t template void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value(); + BlockHandle data_block_handle = index_iter_->value().handle; if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_index_value_.offset() || + data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { @@ -2728,7 +2808,6 @@ void BlockBasedTableIterator::InitDataBlock() { Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, - key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); @@ -2736,6 +2815,47 @@ void BlockBasedTableIterator::InitDataBlock() { } } +template +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. 
+ block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + template void BlockBasedTableIterator::FindBlockForward() { // TODO the while loop inherits from two-level-iterator. We don't know @@ -2766,22 +2886,23 @@ void BlockBasedTableIterator::FindBlockForward() { return; } - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToFirst(); - } else { + if (!index_iter_->Valid()) { return; } - } while (!block_iter_.Valid()); -} -template -void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); + IndexValue v = index_iter_->value(); - if (!block_iter_.Valid()) { - FindBlockForward(); - } + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); } template @@ -2808,8 +2929,7 @@ void BlockBasedTableIterator::FindKeyBackward() { template void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid()) { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { is_out_of_bound_ = user_comparator_.Compare( *read_options_.iterate_upper_bound, user_key()) <= 0; } @@ -2832,8 +2952,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, - compaction_readahead_size); + caller, compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2845,7 +2964,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); + caller, compaction_readahead_size); } } @@ -2961,7 +3080,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2971,12 +3090,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - BlockHandle handle = iiter->value(); + IndexValue v = 
iiter->value(); bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), - prefix_extractor, handle.offset(), no_io, + prefix_extractor, v.handle.offset(), no_io, /*const_ikey_ptr=*/nullptr, &lookup_context); if (not_exist_in_filter) { @@ -2986,78 +3105,85 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; - } else { - BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet}; - bool does_referenced_key_exist = false; - DataBlockIter biter; - uint64_t referenced_data_size = 0; - NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/true, - /*index_key_is_full=*/true, get_context, &lookup_data_block_context, - /*s=*/Status(), /*prefetch_buffer*/ nullptr); + } - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } - bool may_exist = biter.SeekForGet(key); - // If user-specified timestamp is supported, we cannot end the search - // just because hash index lookup indicates the key+ts does not exist. - if (!may_exist && ts_sz == 0) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. - done = true; - } else { - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator( + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); - done = true; - break; - } + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. 
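// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] Why the first_internal_key early
// break added above is safe: index separators are >= every key in their block,
// so an index seek lands on the first block whose separator >= the lookup key;
// if the key also sorts before that block's first key, it falls in the gap
// between two blocks and cannot exist anywhere in the file. A self-contained
// sketch of that argument (BlockMeta and MayContain are illustrative only):
#include <algorithm>
#include <string>
#include <vector>

struct BlockMeta {
  std::string separator;  // >= every key stored in the block
  std::string first_key;  // smallest key stored in the block
};

// `index` must be sorted by separator, mirroring an SST index block.
bool MayContain(const std::vector<BlockMeta>& index, const std::string& key) {
  auto it = std::lower_bound(
      index.begin(), index.end(), key,
      [](const BlockMeta& b, const std::string& k) { return b.separator < k; });
  if (it == index.end()) return false;  // past the last block
  return key >= it->first_key;          // otherwise the key is in a gap
}
// ---------------------------------------------------------------------------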
+ if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; } - s = biter.status(); - } - // Write the block cache access record. - if (block_cache_tracer_) { - // Avoid making copy of block_key, cf_name, and referenced_key when - // constructing the access record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", lookup_data_block_context.block_type, - lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_data_block_context.caller, - lookup_data_block_context.is_cache_hit, - lookup_data_block_context.no_insert, - /*referenced_key=*/"", referenced_data_size, - lookup_data_block_context.num_keys_in_block, - does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } if (done) { @@ -3115,7 +3241,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, sst_file_range.begin()->get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -3130,21 +3256,30 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. 
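// [Editorial aside, not part of the patch] Same gap argument as in Get()
// above (see the MayContain sketch there): the index seek landed on this
// block, yet the key sorts before the block's smallest key, so it cannot be
// anywhere in this file.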
+ break; + } + bool reusing_block = true; uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); - if (iiter->value().offset() != offset) { - offset = iiter->value().offset(); + if (iiter->value().handle.offset() != offset) { + offset = iiter->value().handle.offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/false, - /*index_key_is_full=*/true, get_context, - &lookup_data_block_context, Status(), nullptr); + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, Status(), nullptr); reusing_block = false; } + if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache @@ -3238,7 +3373,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, Status BlockBasedTable::Prefetch(const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; - auto user_comparator = comparator.user_comparator(); + UserComparatorWrapper user_comparator(comparator.user_comparator()); // pre-condition if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); @@ -3248,10 +3383,9 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -3264,13 +3398,12 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - BlockHandle block_handle = iiter->value(); - const bool is_user_key = rep_->table_properties && - rep_->table_properties->index_key_is_user_key > 0; + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; if (end && ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || (is_user_key && - user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { if (prefetching_boundary_page) { break; } @@ -3285,7 +3418,6 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, NewDataBlockIterator( ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, &lookup_context, Status(), /*prefetch_buffer=*/nullptr); @@ -3315,13 +3447,12 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { // Check Data blocks IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{caller}; - InternalIteratorBase* iiter = NewIndexIterator( + InternalIteratorBase* iiter = NewIndexIterator( ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -3332,14 +3463,14 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { } Status BlockBasedTable::VerifyChecksumInBlocks( - InternalIteratorBase* index_iter) { + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; BlockContents contents; BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, @@ -3445,31 +3576,13 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); - return TEST_BlockInCache(iiter->value()); -} - -BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { - // Some old version of block-based tables don't have index type present in - // table properties. If that's the case we can safely use the kBinarySearch. 
- BlockBasedTableOptions::IndexType index_type_on_file = - BlockBasedTableOptions::kBinarySearch; - if (rep_->table_properties) { - auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - // update index_type with the true type - rep_->index_type = index_type_on_file; - } - } - return index_type_on_file; + return TEST_BlockInCache(iiter->value().handle); } // REQUIRES: The following fields of rep_ should have already been populated: @@ -3483,21 +3596,20 @@ Status BlockBasedTable::CreateIndexReader( InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader, BlockCacheLookupContext* lookup_context) { - auto index_type_on_file = rep_->index_type; - // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. // If prefix_extractor does not match prefix_extractor_name from table // properties, turn off Hash Index by setting total_order_seek to true - switch (index_type_on_file) { + switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); } - case BlockBasedTableOptions::kBinarySearch: { + case BlockBasedTableOptions::kBinarySearch: + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); @@ -3527,7 +3639,7 @@ Status BlockBasedTable::CreateIndexReader( } default: { std::string error_message = - "Unrecognized index type: " + ToString(index_type_on_file); + "Unrecognized index type: " + ToString(rep_->index_type); return Status::InvalidArgument(error_message.c_str()); } } @@ -3536,7 +3648,7 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { BlockCacheLookupContext context(caller); - std::unique_ptr> index_iter( + std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/&context)); @@ -3544,7 +3656,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; result = handle.offset(); } else { // key is past the last key in the file. 
If table_properties is not @@ -3574,7 +3686,7 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3595,9 +3707,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); @@ -3806,7 +3917,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3827,8 +3938,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { Slice key = blockhandles_iter->key(); Slice user_key; InternalKey ikey; - if (rep_->table_properties && - rep_->table_properties->index_key_is_user_key != 0) { + if (!rep_->index_key_includes_seq) { user_key = key; } else { ikey.DecodeFrom(key); @@ -3838,7 +3948,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append(" HEX "); out_file->Append(user_key.ToString(true).c_str()); out_file->Append(": "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); out_file->Append("\n"); std::string str_key = user_key.ToString(); @@ -3857,7 +3969,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3879,7 +3991,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - BlockHandle bh = blockhandles_iter->value(); + BlockHandle bh = blockhandles_iter->value().handle; uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); @@ -3888,15 +4000,14 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { out_file->Append("Data Block # "); out_file->Append(rocksdb::ToString(block_id)); out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); out_file->Append("\n"); out_file->Append("--------------------------------------\n"); std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + 
/*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index b03e67128e2..9300fb36a70 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -43,7 +43,6 @@ namespace rocksdb { -class BlockHandle; class Cache; class FilterBlockReader; class BlockBasedFilterBlockReader; @@ -198,7 +197,7 @@ class BlockBasedTable : public TableReader { // wraps the passed iter. In the latter case the return value points // to a different object then iter, and the callee has the ownership of the // returned object. - virtual InternalIteratorBase* NewIterator( + virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) = 0; @@ -230,8 +229,7 @@ class BlockBasedTable : public TableReader { template TBlockIter* NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& block_handle, - TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; @@ -259,6 +257,12 @@ class BlockBasedTable : public TableReader { BlockType block_type, GetContext* get_context) const; + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -312,7 +316,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIteratorBase* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -355,9 +359,6 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - // Figure the index type, update it in rep_, and also return it. - BlockBasedTableOptions::IndexType UpdateIndexType(); - // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter @@ -410,7 +411,7 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. 
virtual FilterBlockReader* ReadFilter( @@ -446,17 +447,14 @@ class BlockBasedTable::PartitionedIndexIteratorState public: PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - const bool index_key_includes_seq, const bool index_key_is_full); - InternalIteratorBase* NewSecondaryIterator( + std::unordered_map>* block_map); + InternalIteratorBase* NewSecondaryIterator( const BlockHandle& index_value) override; private: // Don't own table_ const BlockBasedTable* table_; std::unordered_map>* block_map_; - bool index_key_includes_seq_; - bool index_key_is_full_; }; // Stores all the properties associated with a BlockBasedTable. @@ -564,12 +562,16 @@ struct BlockBasedTable::Rep { // still work, just not as quickly. bool blocks_definitely_zstd_compressed = false; + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + bool closed = false; const bool immortal_table; SequenceNumber get_global_seqno(BlockType block_type) const { return (block_type == BlockType::kFilter || - block_type == BlockType::kIndex || block_type == BlockType::kCompressionDictionary) ? kDisableGlobalSequenceNumber : global_seqno; @@ -602,11 +604,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, + InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, - BlockType block_type, bool key_includes_seq, - bool index_key_is_full, TableReaderCaller caller, + BlockType block_type, TableReaderCaller caller, size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), @@ -620,8 +621,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), block_type_(block_type), - key_includes_seq_(key_includes_seq), - index_key_is_full_(index_key_is_full), lookup_context_(caller), compaction_readahead_size_(compaction_readahead_size) {} @@ -635,19 +634,38 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && block_iter_points_to_real_block_ && - block_iter_.Valid(); + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); - return block_iter_.key(); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } } Slice user_key() const override { assert(Valid()); - return block_iter_.user_key(); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } } TValue value() const override { assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. 
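// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] The is_at_first_key_from_index_ /
// MaterializeCurrentBlock() pair above is a lazy-load pattern: key() can be
// answered straight from the index entry, and the data block is only read
// when value() is actually called. Reduced to its core (LazyEntry is
// illustrative, not RocksDB API):
#include <functional>
#include <optional>
#include <string>
#include <utility>

class LazyEntry {
 public:
  LazyEntry(std::string first_key, std::function<std::string()> load_block)
      : key_(std::move(first_key)), load_(std::move(load_block)) {}

  const std::string& key() const { return key_; }  // served without any I/O

  const std::string& value() {  // first call triggers the (expensive) load
    if (!value_) value_ = load_();
    return *value_;
  }

 private:
  std::string key_;
  std::function<std::string()> load_;
  std::optional<std::string> value_;
};
// ---------------------------------------------------------------------------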
+ return TValue(); + } + return block_iter_.value(); } Status status() const override { @@ -667,10 +685,17 @@ class BlockBasedTableIterator : public InternalIteratorBase { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { + // Load current block if not loaded. + if (is_at_first_key_from_index_) { + const_cast(this)->MaterializeCurrentBlock(); + } // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; @@ -704,35 +729,33 @@ class BlockBasedTableIterator : public InternalIteratorBase { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - prev_index_value_ = index_iter_->value(); + prev_block_offset_ = index_iter_->value().handle.offset(); } } - void InitDataBlock(); - inline void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - private: const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to value() will trigger loading the block. + bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; BlockType block_type_; - // If the keys in the blocks over which we iterate include 8 byte sequence - bool key_includes_seq_; - bool index_key_is_full_; - BlockHandle prev_index_value_; + uint64_t prev_block_offset_; BlockCacheLookupContext lookup_context_; // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -748,6 +771,16 @@ class BlockBasedTableIterator : public InternalIteratorBase { size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; + + // If `target` is null, seek to first. 
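// [Editorial aside, not part of the patch] Seek() and SeekToFirst() now
// funnel into SeekImpl(), with a null target meaning "position at the first
// key", so the reseek-avoidance and deferred-block-load logic lives in one
// place; the calling convention, as defined earlier in this patch, is simply:
//   void Seek(const Slice& target) { SeekImpl(&target); }
//   void SeekToFirst()             { SeekImpl(nullptr); }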
+ void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); }; } // namespace rocksdb diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 2dab4627cb6..e0ca24bf482 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -69,37 +69,12 @@ void GenerateRandomKVs(std::vector *keys, } } -// Same as GenerateRandomKVs but the values are BlockHandle -void GenerateRandomKBHs(std::vector *keys, - std::vector *values, const int from, - const int len, const int step = 1, - const int padding_size = 0, - const int keys_share_prefix = 1) { - Random rnd(302); - uint64_t offset = 0; - - // generate different prefix - for (int i = from; i < from + len; i += step) { - // generate keys that shares the prefix - for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); - - uint64_t size = rnd.Uniform(1024 * 16); - BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; - values->emplace_back(handle); - } - } -} - class BlockTest : public testing::Test {}; // block test TEST_F(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -123,7 +98,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, options.comparator); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -136,8 +111,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewIterator(options.comparator, options.comparator); + iter = reader.NewDataIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -152,83 +126,6 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } -TEST_F(BlockTest, ValueDeltaEncodingTest) { - Random rnd(301); - Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); - - std::vector keys; - std::vector values; - const bool kUseDeltaEncoding = true; - const bool kUseValueDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); - int num_records = 100; - - GenerateRandomKBHs(&keys, &values, 0, num_records); - // add a bunch of records to a block - BlockHandle last_encoded_handle; - for (int i = 0; i < num_records; i++) { - auto block_handle = values[i]; - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle.size()); - last_encoded_handle = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); - } - - // read serialized contents of the block - Slice rawblock = builder.Finish(); - - // create block reader - BlockContents contents; - contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); - - const bool kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const 
bool kValueIsFull = !kUseValueDeltaEncoding; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; - // read contents of block sequentially - int count = 0; - InternalIteratorBase *iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { - // read kv from block - Slice k = iter->key(); - BlockHandle handle = iter->value(); - - // compare with lookaside array - ASSERT_EQ(k.ToString().compare(keys[count]), 0); - - ASSERT_EQ(values[count].offset(), handle.offset()); - ASSERT_EQ(values[count].size(), handle.size()); - } - delete iter; - - // read block contents randomly - iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array - int index = rnd.Uniform(num_records); - Slice k(keys[index]); - - // search in block for this key - iter->Seek(k); - ASSERT_TRUE(iter->Valid()); - BlockHandle handle = iter->value(); - ASSERT_EQ(values[index].offset(), handle.offset()); - ASSERT_EQ(values[index].size(), handle.size()); - } - delete iter; -} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, @@ -261,8 +158,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator(), - BytewiseComparator())); + reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -457,8 +353,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { TEST_F(BlockTest, BlockWithReadAmpBitmap) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -486,9 +380,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -519,9 +412,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -555,9 +447,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -602,6 +493,132 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) { ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); } +class IndexBlockTest + : 
public testing::Test,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  IndexBlockTest() = default;
+
+  bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); }
+  bool includeFirstKey() const { return std::get<1>(GetParam()); }
+};
+
+// Similar to GenerateRandomKVs but for index block contents.
+void GenerateRandomIndexEntries(std::vector<std::string> *separators,
+                                std::vector<BlockHandle> *block_handles,
+                                std::vector<std::string> *first_keys,
+                                const int len) {
+  Random rnd(42);
+
+  // For each of `len` blocks, we need to generate a first and last key.
+  // Let's generate n*2 random keys, sort them, group into consecutive pairs.
+  std::set<std::string> keys;
+  while ((int)keys.size() < len * 2) {
+    // Keys need to be at least 8 bytes long to look like internal keys.
+    keys.insert(test::RandomKey(&rnd, 12));
+  }
+
+  uint64_t offset = 0;
+  for (auto it = keys.begin(); it != keys.end();) {
+    first_keys->emplace_back(*it++);
+    separators->emplace_back(*it++);
+    uint64_t size = rnd.Uniform(1024 * 16);
+    BlockHandle handle(offset, size);
+    offset += size + kBlockTrailerSize;
+    block_handles->emplace_back(handle);
+  }
+}
+
+TEST_P(IndexBlockTest, IndexValueEncodingTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> separators;
+  std::vector<BlockHandle> block_handles;
+  std::vector<std::string> first_keys;
+  const bool kUseDeltaEncoding = true;
+  BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding());
+  int num_records = 100;
+
+  GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
+                             num_records);
+  BlockHandle last_encoded_handle;
+  for (int i = 0; i < num_records; i++) {
+    IndexValue entry(block_handles[i], first_keys[i]);
+    std::string encoded_entry;
+    std::string delta_encoded_entry;
+    entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
+    if (useValueDeltaEncoding() && i > 0) {
+      entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
+                     &last_encoded_handle);
+    }
+    last_encoded_handle = entry.handle;
+    const Slice delta_encoded_entry_slice(delta_encoded_entry);
+    builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  const bool kTotalOrderSeek = true;
+  const bool kIncludesSeq = true;
+  const bool kValueIsFull = !useValueDeltaEncoding();
+  IndexBlockIter *kNullIter = nullptr;
+  Statistics *kNullStats = nullptr;
+  // read contents of block sequentially
+  InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+      options.comparator, options.comparator, kNullIter, kNullStats,
+      kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull);
+  iter->SeekToFirst();
+  for (int index = 0; index < num_records; ++index) {
+    ASSERT_TRUE(iter->Valid());
+
+    Slice k = iter->key();
+    IndexValue v = iter->value();
+
+    EXPECT_EQ(separators[index], k.ToString());
+    EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+    EXPECT_EQ(block_handles[index].size(), v.handle.size());
+    EXPECT_EQ(includeFirstKey() ?
first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char **argv) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 5ec0938714f..484617d7e14 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); bool may_exist; // search in block for the key just inserted { @@ -474,8 +474,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -512,8 +511,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index c1ce541ae56..f3a4b10e01e 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -36,7 +36,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening); + table_opt.index_shortening, /* include_first_key */ false); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder( @@ -48,6 +48,12 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); } break; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + 
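// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] How a user would opt into the new
// index format handled by the case above once this patch lands; a minimal
// sketch with error handling omitted (the path argument is hypothetical):
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/table.h"

void OpenWithFirstKeyIndex(const std::string& path) {
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;

  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(opts, path, &db);
  // Newly written SST files now carry each block's first key in the index,
  // enabling the iterator's deferred block loads.
  if (s.ok()) delete db;
}
// ---------------------------------------------------------------------------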
table_opt.index_shortening, /* include_first_key */ true); + } break; default: { assert(!"Do not recognize the index type "); } break; @@ -94,7 +100,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening); + table_opt_.index_shortening, /* include_first_key */ false); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 6baa9891b1d..47348b31f78 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -58,6 +58,7 @@ class IndexBuilder { // To allow further optimization, we provide `last_key_in_current_block` and // `first_key_in_next_block`, based on which the specific implementation can // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. // @last_key_in_current_block: this parameter maybe overridden with the value // "substitute key". // @first_key_in_next_block: it will be nullptr if the entry being added is @@ -123,7 +124,8 @@ class ShortenedIndexBuilder : public IndexBuilder { const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, @@ -131,11 +133,19 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { @@ -159,20 +169,27 @@ class ShortenedIndexBuilder : public IndexBuilder { } auto sep = Slice(*last_key_in_current_block); - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle_.size()); - assert(handle_delta_encoding.size() != 0); + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // 
BlockBuilder::Add() below won't use delta-encoded slice. + } last_encoded_handle_ = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(sep, handle_encoding, - &handle_delta_encoding_slice); + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, - &handle_delta_encoding_slice); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); } + + current_block_first_internal_key_.clear(); } using IndexBuilder::Finish; @@ -200,9 +217,12 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; bool seperator_is_key_plus_seq_; + const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; - BlockHandle last_encoded_handle_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -243,7 +263,7 @@ class HashIndexBuilder : public IndexBuilder { : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode), + shortening_mode, /* include_first_key */ false), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index cce6744157e..dcd985152bb 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -147,12 +147,13 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -221,15 +222,16 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); } assert(iter.Valid()); - BlockHandle fltr_blk_handle = iter.value(); + BlockHandle fltr_blk_handle = iter.value().handle; return fltr_blk_handle; } @@ -280,18 +282,19 @@ void PartitionedFilterBlockReader::CacheDependencies( BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, 
comparator_.user_comparator(), &biter, kNullStats, true,
-      index_key_includes_seq_, index_value_is_full_);
+      /* have_first_key */ false, index_key_includes_seq_,
+      index_value_is_full_);
   // Index partitions are assumed to be consecutive. Prefetch them all.
   // Read the first block offset
   biter.SeekToFirst();
-  BlockHandle handle = biter.value();
+  BlockHandle handle = biter.value().handle;
   uint64_t prefetch_off = handle.offset();

   // Read the last block's offset
   biter.SeekToLast();
-  handle = biter.value();
+  handle = biter.value().handle;
   uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
   uint64_t prefetch_len = last_off - prefetch_off;
   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
@@ -304,7 +307,7 @@ void PartitionedFilterBlockReader::CacheDependencies(
   // After prefetch, read the partitions one by one
   biter.SeekToFirst();
   for (; biter.Valid(); biter.Next()) {
-    handle = biter.value();
+    handle = biter.value().handle;
     const bool no_io = true;
     const bool is_a_filter_partition = true;
     auto filter = table_->GetFilter(
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index 6fdddc37e49..81e1345d9c2 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -15,7 +15,6 @@
 #include "logging/logging.h"
 #include "memory/memory_allocator.h"
 #include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics.h"
 #include "rocksdb/env.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_reader.h"
diff --git a/table/format.cc b/table/format.cc
index 2046903a703..b3eb281a2e5 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -91,6 +91,58 @@ std::string BlockHandle::ToString(bool hex) const {
 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);

+void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
+                          const BlockHandle* previous_handle) const {
+  if (previous_handle) {
+    assert(handle.offset() == previous_handle->offset() +
+                                  previous_handle->size() + kBlockTrailerSize);
+    PutVarsignedint64(dst, handle.size() - previous_handle->size());
+  } else {
+    handle.EncodeTo(dst);
+  }
+  assert(dst->size() != 0);
+
+  if (have_first_key) {
+    PutLengthPrefixedSlice(dst, first_internal_key);
+  }
+}
+
+Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
+                              const BlockHandle* previous_handle) {
+  if (previous_handle) {
+    int64_t delta;
+    if (!GetVarsignedint64(input, &delta)) {
+      return Status::Corruption("bad delta-encoded index value");
+    }
+    handle = BlockHandle(
+        previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
+        previous_handle->size() + delta);
+  } else {
+    Status s = handle.DecodeFrom(input);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!have_first_key) {
+    first_internal_key = Slice();
+  } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
+    return Status::Corruption("bad first key in block info");
+  }
+
+  return Status::OK();
+}
+
+std::string IndexValue::ToString(bool hex, bool have_first_key) const {
+  std::string s;
+  EncodeTo(&s, have_first_key, nullptr);
+  if (hex) {
+    return Slice(s).ToString(true);
+  } else {
+    return s;
+  }
+}
+
 namespace {
 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
   return magic_number == kLegacyBlockBasedTableMagicNumber ||
diff --git a/table/format.h b/table/format.h
index baad78070ca..539ca88805c 100644
--- a/table/format.h
+++ b/table/format.h
@@ -76,6 +76,35 @@ class BlockHandle {
   static const BlockHandle kNullBlockHandle;
 };

+// Value in block-based table file index.
+// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { #ifdef NDEBUG diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd68e..696e66135dc 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -90,8 +90,11 @@ class InternalIteratorBase : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it is out of the iterator - // upper bound + // True if the iterator is invalidated because it reached a key that is above + // the iterator upper bound. Used by LevelIterator to decide whether it should + // stop or move on to the next file. + // Important: if iterator reached the end of the file without encountering any + // keys above the upper bound, IsOutOfBound() must return false. 
virtual bool IsOutOfBound() { return false; } // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont diff --git a/table/iterator.cc b/table/iterator.cc index 97a0cef5e08..f6c7f9cec3f 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -167,7 +167,7 @@ template InternalIteratorBase* NewErrorInternalIterator(const Status& status) { return new EmptyInternalIterator(status); } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status); template InternalIteratorBase* NewErrorInternalIterator( const Status& status); @@ -182,7 +182,7 @@ InternalIteratorBase* NewErrorInternalIterator(const Status& status, return new (mem) EmptyInternalIterator(status); } } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); @@ -191,7 +191,7 @@ template InternalIteratorBase* NewEmptyInternalIterator() { return new EmptyInternalIterator(Status::OK()); } -template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); template InternalIteratorBase* NewEmptyInternalIterator(); template @@ -203,7 +203,7 @@ InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { return new (mem) EmptyInternalIterator(Status::OK()); } } -template InternalIteratorBase* NewEmptyInternalIterator( +template InternalIteratorBase* NewEmptyInternalIterator( Arena* arena); template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 4205d298b6d..3bbc6d87080 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -229,8 +229,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); DataBlockIter iter; - properties_block.NewIterator(BytewiseComparator(), - BytewiseComparator(), &iter); + properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -386,9 +386,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. 
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -459,8 +458,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -504,8 +503,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); diff --git a/table/table_test.cc b/table/table_test.cc index 2e2286efae4..418ecf004b7 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -236,7 +236,7 @@ class BlockConstructor: public Constructor { } InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_, comparator_); + return block_->NewDataIterator(comparator_, comparator_); } private: @@ -308,8 +308,9 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, - int level = -1) + int level = -1, SequenceNumber largest_seqno = 0) : Constructor(cmp), + largest_seqno_(largest_seqno), convert_to_internal_key_(convert_to_internal_key), level_(level) {} ~TableConstructor() override { Reset(); } @@ -326,6 +327,14 @@ class TableConstructor: public Constructor { std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. 
+ int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, @@ -362,7 +371,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_), + level_, largest_seqno_, nullptr), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -428,6 +437,7 @@ class TableConstructor: public Constructor { std::unique_ptr file_writer_; std::unique_ptr file_reader_; std::unique_ptr table_reader_; + SequenceNumber largest_seqno_; bool convert_to_internal_key_; int level_; @@ -1484,7 +1494,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - for (int i = 0; i < 4; ++i) { + for (int i = 0; i <= 5; ++i) { Options options; // Make each key/value an individual block table_options.block_size = 64; @@ -1515,11 +1525,16 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { options.prefix_extractor.reset(NewFixedPrefixTransform(4)); break; case 4: - default: - // Binary search index + // Two-level index table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.table_factory.reset(new BlockBasedTableFactory(table_options)); break; + case 5: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -1663,10 +1678,10 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int /*suffix_len*/ = 800) { + std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); - c->Add(k.Encode().ToString(), "v"); + c->Add(k.Encode().ToString(), value); } void TableTest::IndexTest(BlockBasedTableOptions table_options) { @@ -1845,6 +1860,286 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { ASSERT_TRUE(iter->status().IsIncomplete()); } +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "table_test"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int 
use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(std::vector{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. + AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. 
+ iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. + iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 
4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. 
@@ -3606,9 +3901,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -3688,8 +3982,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewIterator(options.comparator, - options.comparator)}; + metaindex_block.NewDataIterator(options.comparator, options.comparator)}; uint64_t max_offset = 0; std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 7ff73cd4e4f..1cb00b63928 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -19,11 +19,11 @@ namespace rocksdb { namespace { -class TwoLevelIndexIterator : public InternalIteratorBase { +class TwoLevelIndexIterator : public InternalIteratorBase { public: explicit TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); ~TwoLevelIndexIterator() override { first_level_iter_.DeleteIter(false /* is_arena_mode */); @@ -43,7 +43,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { assert(Valid()); return second_level_iter_.key(); } - BlockHandle value() const override { + IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -69,12 +69,12 @@ class TwoLevelIndexIterator : public InternalIteratorBase { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIteratorBase* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapperBase first_level_iter_; - IteratorWrapperBase second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
@@ -83,7 +83,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { TwoLevelIndexIterator::TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIndexIterator::Seek(const Slice& target) { @@ -177,8 +177,8 @@ void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { } void TwoLevelIndexIterator::SetSecondLevelIterator( - InternalIteratorBase* iter) { - InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } @@ -186,14 +186,14 @@ void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - BlockHandle handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value().handle; if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIteratorBase* iter = + InternalIteratorBase* iter = state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); @@ -203,9 +203,9 @@ void TwoLevelIndexIterator::InitDataBlock() { } // namespace -InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) { + InternalIteratorBase* first_level_iter) { return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 55d5c01a4ae..545c29f493e 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,11 +22,10 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIteratorBase* NewSecondaryIterator( + virtual InternalIteratorBase* NewSecondaryIterator( const BlockHandle& handle) = 0; }; - // Return a new two level iterator. A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -37,8 +36,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +extern InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 4e37cde40d1..61a49d88a17 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -9,6 +9,7 @@ #include "test_util/testutil.h" +#include #include #include @@ -197,8 +198,12 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.cache_index_and_filter_blocks = rnd->Uniform(2); opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.pin_top_level_index_and_filter = rnd->Uniform(2); - opt.index_type = rnd->Uniform(2) ? 
BlockBasedTableOptions::kBinarySearch
-                                     : BlockBasedTableOptions::kHashSearch;
+  using IndexType = BlockBasedTableOptions::IndexType;
+  const std::array<IndexType, 4> index_types = {
+      {IndexType::kBinarySearch, IndexType::kHashSearch,
+       IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}};
+  opt.index_type =
+      index_types[rnd->Uniform(static_cast<int>(index_types.size()))];
   opt.hash_index_allow_collision = rnd->Uniform(2);
   opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3));
   opt.block_size = rnd->Uniform(10000000);
diff --git a/util/coding.h b/util/coding.h
index 4046a2b60bf..9427d52618e 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -58,6 +58,7 @@ extern bool GetFixed32(Slice* input, uint32_t* value);
 extern bool GetFixed16(Slice* input, uint16_t* value);
 extern bool GetVarint32(Slice* input, uint32_t* value);
 extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetVarsignedint64(Slice* input, int64_t* value);
 extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
 // This function assumes data is well-formed.
 extern Slice GetLengthPrefixedSlice(const char* data);
@@ -377,6 +378,18 @@ inline bool GetVarint64(Slice* input, uint64_t* value) {
   }
 }

+inline bool GetVarsignedint64(Slice* input, int64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarsignedint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, static_cast<size_t>(limit - q));
+    return true;
+  }
+}
+
 // Provide an interface for platform independent endianness transformation
 inline uint64_t EndianTransform(uint64_t input, size_t size) {
   char* pos = reinterpret_cast<char*>(&input);

From 9dbcda9e3b9b59b76b247e24e9ebc4b9263197ff Mon Sep 17 00:00:00 2001
From: Mike Kolupaev
Date: Tue, 25 Jun 2019 22:58:56 -0700
Subject: [PATCH 181/572] Fix uninitialized prev_block_offset_ in
 BlockBasedTableReader (#5507)

Summary:
Found by valgrind_check.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5507

Differential Revision: D16002612

Pulled By: miasantreble

fbshipit-source-id: 13c11c183190e0a0571844635457d434da3ac59a
---
 table/block_based/block_based_table_reader.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 9300fb36a70..4356713910c 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -755,7 +755,7 @@ class BlockBasedTableIterator : public InternalIteratorBase {
   bool need_upper_bound_check_;
   const SliceTransform* prefix_extractor_;
   BlockType block_type_;
-  uint64_t prev_block_offset_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
   BlockCacheLookupContext lookup_context_;
   // Readahead size used in compaction, its value is used only if
   // lookup_context_.caller = kCompaction.

From a8975b62455cb73a8e23ff6be709df1b97859d2d Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 27 Jun 2019 08:31:03 -0700
Subject: [PATCH 182/572] Block cache tracer: Do not populate block cache
 trace record when tracing is disabled. (#5510)

Summary:
This PR makes sure that trace record is not populated when tracing is disabled.
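For reference, the guard added at every tracing site in the diff below is, schematically, the following sketch (member names are taken from the patch; the surrounding record-building code is elided):

```cpp
// Only pay for constructing a BlockCacheTraceRecord when a trace is
// actually being collected; is_tracing_enabled() is a relaxed atomic load.
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
    lookup_context) {
  // ... build the BlockCacheTraceRecord and call WriteBlockAccess() ...
}
```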
Before this PR: DB path: [/data/mysql/rocks_regression_tests/OPTIONS-myrocks-40-33-10000000/2019-06-26-13-04-41/db] readwhilewriting : 9.803 micros/op 1550408 ops/sec; 107.9 MB/s (5000000 of 5000000 found) Microseconds per read: Count: 80000000 Average: 9.8045 StdDev: 12.64 Min: 1 Median: 7.5246 Max: 25343 Percentiles: P50: 7.52 P75: 12.10 P99: 37.44 P99.9: 75.07 P99.99: 133.60 After this PR: DB path: [/data/mysql/rocks_regression_tests/OPTIONS-myrocks-40-33-10000000/2019-06-26-14-08-21/db] readwhilewriting : 8.723 micros/op 1662882 ops/sec; 115.8 MB/s (5000000 of 5000000 found) Microseconds per read: Count: 80000000 Average: 8.7236 StdDev: 12.19 Min: 1 Median: 6.7262 Max: 25229 Percentiles: P50: 6.73 P75: 10.50 P99: 31.54 P99.9: 74.81 P99.99: 132.82 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5510 Differential Revision: D16016428 Pulled By: HaoyuHuang fbshipit-source-id: 3b3d11e6accf207d18ec2545b802aa01ee65901f --- table/block_based/block_based_table_reader.cc | 13 ++++++++----- trace_replay/block_cache_tracer.h | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 5344625ec94..e73b0c08c41 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1967,7 +1967,8 @@ CachableEntry BlockBasedTable::GetFilter( } } - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( @@ -2048,7 +2049,8 @@ CachableEntry BlockBasedTable::GetUncompressionDict( } } } - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( @@ -2273,7 +2275,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } // Fill lookup_context. - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { size_t usage = 0; uint64_t nkeys = 0; if (block_entry->GetValue()) { @@ -3167,7 +3170,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, s = biter.status(); } // Write the block cache access record. - if (block_cache_tracer_) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. BlockCacheTraceRecord access_record( @@ -3334,7 +3337,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, s = biter.status(); } // Write the block cache access. - if (block_cache_tracer_) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. BlockCacheTraceRecord access_record( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e7f38db3c6d..e2ad933b9b8 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -197,6 +197,10 @@ class BlockCacheTracer { // Stop writing block cache accesses to the trace_writer. 
void EndTrace(); + bool is_tracing_enabled() const { + return writer_.load(std::memory_order_relaxed); + } + Status WriteBlockAccess(const BlockCacheTraceRecord& record, const Slice& block_key, const Slice& cf_name, const Slice& referenced_key); From c08c0ae73131457a2ac74507da58ff49870c1ee6 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 27 Jun 2019 08:54:28 -0700 Subject: [PATCH 183/572] Add C binding for secondary instance (#5505) Summary: Add C binding for secondary instance as well as unit test. Test plan (on devserver) ``` $make clean && COMPILE_WITH_ASAN=1 make -j20 all $./c_test $make check ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5505 Differential Revision: D16000043 Pulled By: riversand963 fbshipit-source-id: 3361ef6bfdf4ce12438cee7290a0ac203b5250bd --- HISTORY.md | 1 + db/c.cc | 50 +++++++++++++++++++++++++++++++++++++++++ db/c_test.c | 54 +++++++++++++++++++++++++++++++++++++++++++++ include/rocksdb/c.h | 14 ++++++++++++ 4 files changed, 119 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 07eb2759736..d3660ee64ac 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Add C bindings for secondary instance, i.e. DBImplSecondary. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. 
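For illustration, a minimal usage sketch of the new C API added below (paths are placeholders; `errptr`-based error checking is elided for brevity):

```c
#include <stddef.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_max_open_files(opts, -1);  // secondary mode needs -1
  // Open a read-only secondary instance that tails the primary's files.
  rocksdb_t* db = rocksdb_open_as_secondary(opts, "/path/to/primary_db",
                                            "/path/to/secondary_path", &err);
  /* ... the primary writes and flushes in another process ... */
  rocksdb_try_catch_up_with_primary(db, &err);  // replay the primary's updates
  rocksdb_close(db);
  rocksdb_options_destroy(opts);
  return 0;
}
```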
diff --git a/db/c.cc b/db/c.cc index 8f96366fbed..17dc766dd66 100644 --- a/db/c.cc +++ b/db/c.cc @@ -517,6 +517,21 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; @@ -717,6 +732,37 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( return result; } +rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* db_options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i != num_column_families; ++i) { + column_families.emplace_back( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep)); + } + DB* db; + std::vector handles; + if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep), + std::string(name), + std::string(secondary_path), + column_families, &handles, &db))) { + return nullptr; + } + for (size_t i = 0; i != handles.size(); ++i) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, @@ -3423,6 +3469,10 @@ void rocksdb_ingest_external_file_cf( SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); } +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} + rocksdb_slicetransform_t* rocksdb_slicetransform_create( void* state, void (*destructor)(void*), diff --git a/db/c_test.c b/db/c_test.c index 64241df287b..4b4b165c879 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -45,6 +45,7 @@ static char sstfilename[200]; static char dbbackupname[200]; static char dbcheckpointname[200]; static char dbpathname[200]; +static char secondary_path[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -1722,6 +1723,59 @@ int main(int argc, char** argv) { CheckNoError(err); } + // Check that secondary instance works. 
+ StartPhase("open_as_secondary"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_t* db1; + rocksdb_options_t* opts = rocksdb_options_create(); + rocksdb_options_set_max_open_files(opts, -1); + rocksdb_options_set_create_if_missing(opts, 1); + snprintf(secondary_path, sizeof(secondary_path), + "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid())); + db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err); + CheckNoError(err); + + rocksdb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_disable_WAL(woptions, 1); + rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err); + CheckNoError(err); + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(ropts, 1); + rocksdb_readoptions_set_snapshot(ropts, NULL); + CheckGet(db, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key0", "value0"); + + rocksdb_writeoptions_disable_WAL(woptions, 0); + rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + CheckGet(db1, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key1", "value1"); + + rocksdb_close(db1); + rocksdb_destroy_db(opts, secondary_path, &err); + CheckNoError(err); + + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(opts); + rocksdb_readoptions_destroy(ropts); + rocksdb_flushoptions_destroy(flush_opts); + } + // Simple sanity check that options setting db_paths work. 
StartPhase("open_db_paths"); { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 5e75dd70964..e8cb3224248 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -138,6 +138,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); @@ -218,6 +222,13 @@ rocksdb_open_for_read_only_column_families( rocksdb_column_family_handle_t** column_family_handles, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** colummn_family_handles, char** errptr); + extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, size_t* lencf, char** errptr); @@ -1375,6 +1386,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( const char* const* file_list, const size_t list_len, const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + /* SliceTransform */ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* From 15fd3be07bd7a6fa29604277e9a9be21f458c426 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 27 Jun 2019 10:16:21 -0700 Subject: [PATCH 184/572] LRU Cache to enable mid-point insertion by default (#5508) Summary: Mid-point insertion is a useful feature and is mature now. Make it default. Also changed cache_index_and_filter_blocks_with_high_priority=true as default accordingly, so that we won't evict index and filter blocks easier after the change, to avoid too many surprises to users. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5508 Test Plan: Run all existing tests. Differential Revision: D16021179 fbshipit-source-id: ce8456e8d43b3bfb48df6c304b5290a9d19817eb --- HISTORY.md | 4 ++ cache/cache_test.cc | 2 +- include/rocksdb/cache.h | 4 +- include/rocksdb/table.h | 2 +- options/options_test.cc | 39 ++++++++++--------- .../block_based/block_based_table_factory.cc | 7 +++- table/block_based/block_based_table_reader.cc | 27 +++++++------ 7 files changed, 48 insertions(+), 37 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d3660ee64ac..79feac37cbb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # Rocksdb Change Log ## Unreleased +### Default Option Change +* LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explictly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. +* Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. + ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. 
Users can retry after all snapshots are released. * Partitions of partitioned indexes no longer affect the read amplification statistics. diff --git a/cache/cache_test.cc b/cache/cache_test.cc index d7b191bb31f..46ce78db68f 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -90,7 +90,7 @@ class CacheTest : public testing::TestWithParam { bool strict_capacity_limit) { auto type = GetParam(); if (type == kLRU) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, 0.0); } if (type == kClock) { return NewClockCache(capacity, num_shard_bits, strict_capacity_limit); diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 8fb691559d0..410c2cf827a 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -59,7 +59,7 @@ struct LRUCacheOptions { // // See also // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. - double high_pri_pool_ratio = 0.0; + double high_pri_pool_ratio = 0.5; // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using @@ -99,7 +99,7 @@ struct LRUCacheOptions { // will be at least 512KB and number of shard bits will not exceed 6. extern std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, - bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex); diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 929239100a4..712c604ad35 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -74,7 +74,7 @@ struct BlockBasedTableOptions { // blocks with high priority. If set to true, depending on implementation of // block cache, index and filter blocks may be less likely to be evicted // than data blocks. - bool cache_index_and_filter_blocks_with_high_priority = false; + bool cache_index_and_filter_blocks_with_high_priority = true; // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is diff --git a/options/options_test.cc b/options/options_test.cc index 24aeec99e17..823a9c1e054 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -617,8 +617,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.block_cache)->GetNumShardBits(), GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -627,16 +628,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { GetDefaultCacheShardBits( new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); // Set couple of block cache options. 
- ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={num_shard_bits=5;" - "high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 5); @@ -648,9 +650,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, @@ -664,16 +666,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); } #endif // !ROCKSDB_LITE diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 96812e233b8..9dca2a6f0c1 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -167,7 +167,12 @@ BlockBasedTableFactory::BlockBasedTableFactory( if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { - table_options_.block_cache = NewLRUCache(8 << 20); + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index e73b0c08c41..017d6126c2b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2626,12 +2626,11 @@ void BlockBasedTableIterator::SeekImpl( CheckOutOfBound(); if (target) { - assert( - !Valid() || - ((block_type_ == BlockType::kIndex && - !table_->get_rep()->index_key_includes_seq) - ? 
(user_comparator_.Compare(ExtractUserKey(*target), key()) <= 0) - : (icomp_.Compare(*target, key()) <= 0))); + assert(!Valid() || ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? (user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); } } @@ -2954,8 +2953,8 @@ InternalIterator* BlockBasedTable::NewIterator( /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, - caller, compaction_readahead_size); + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2966,8 +2965,8 @@ InternalIterator* BlockBasedTable::NewIterator( &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, - caller, compaction_readahead_size); + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); } } @@ -3125,8 +3124,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, DataBlockIter biter; uint64_t referenced_data_size = 0; NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, - get_context, &lookup_data_block_context, + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (no_io && biter.status().IsIncomplete()) { @@ -3278,8 +3277,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, offset = iiter->value().handle.offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, - get_context, &lookup_data_block_context, Status(), nullptr); + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, Status(), nullptr); reusing_block = false; } From 5c2f13fb14540f8b57337120811bf755e132c6fb Mon Sep 17 00:00:00 2001 From: Aaron Gao Date: Thu, 27 Jun 2019 11:08:45 -0700 Subject: [PATCH 185/572] add create_column_family and drop_column_family cmd to ldb tool (#5503) Summary: `create_column_family` cmd already exists but was somehow missed in the help message. also add `drop_column_family` cmd which can drop a cf without opening db. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5503 Test Plan: Updated existing ldb_test.py to test deleting a column family. 
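For example, usage from the command line looks like this (the db path and column family name are placeholders; `DoCommand()` prints `OK` on success):

```
$ ./ldb --db=/path/to/db create_column_family three
OK
$ ./ldb --db=/path/to/db drop_column_family three
OK
```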
Differential Revision: D16018414 Pulled By: lightmark fbshipit-source-id: 1fc33680b742104fea86b10efc8499f79e722301 --- tools/ldb_cmd.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/ldb_cmd_impl.h | 17 +++++++++++++++++ tools/ldb_test.py | 2 ++ tools/ldb_tool.cc | 2 ++ 4 files changed, 63 insertions(+) diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 49489173c33..a1507b188b2 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -223,6 +223,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { return new CreateColumnFamilyCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == DropColumnFamilyCommand::Name()) { + return new DropColumnFamilyCommand(parsed_params.cmd_params, + parsed_params.option_map, + parsed_params.flags); } else if (parsed_params.cmd == DBFileDumperCommand::Name()) { return new DBFileDumperCommand(parsed_params.cmd_params, parsed_params.option_map, @@ -1125,6 +1129,44 @@ void CreateColumnFamilyCommand::DoCommand() { CloseDB(); } +void DropColumnFamilyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DropColumnFamilyCommand::Name()); + ret.append(" --db= "); + ret.append("\n"); +} + +DropColumnFamilyCommand::DropColumnFamilyCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, true, {ARG_DB}) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "The name of column family to drop must be specified"); + } else { + cf_name_to_drop_ = params[0]; + } +} + +void DropColumnFamilyCommand::DoCommand() { + auto iter = cf_handles_.find(cf_name_to_drop_); + if (iter == cf_handles_.end()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Column family: " + cf_name_to_drop_ + " doesn't exist in db."); + return; + } + ColumnFamilyHandle* cf_handle_to_drop = iter->second; + Status st = db_->DropColumnFamily(cf_handle_to_drop); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + "Fail to drop column family: " + st.ToString()); + } + CloseDB(); +} + // ---------------------------------------------------------------------------- namespace { diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index 868c81f44c8..23bafe68254 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -205,6 +205,23 @@ class CreateColumnFamilyCommand : public LDBCommand { std::string new_cf_name_; }; +class DropColumnFamilyCommand : public LDBCommand { + public: + static std::string Name() { return "drop_column_family"; } + + DropColumnFamilyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return false; } + + private: + std::string cf_name_to_drop_; +}; + class ReduceDBLevelsCommand : public LDBCommand { public: static std::string Name() { return "reduce_levels"; } diff --git a/tools/ldb_test.py b/tools/ldb_test.py index e64e76ee731..26167ee83fd 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -553,8 +553,10 @@ def testColumnFamilies(self): "1") self.assertRunOK("get cf3_1 --column_family=three", "3") + self.assertRunOK("drop_column_family three", "OK") # non-existing column family. self.assertRunFAIL("get cf3_1 --column_family=four") + self.assertRunFAIL("drop_column_family four") def testIngestExternalSst(self): print "Running testIngestExternalSst..." 
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index fe307eab7dc..2813f6c6edf 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -82,6 +82,8 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   DBLoaderCommand::Help(ret);
   ManifestDumpCommand::Help(ret);
   ListColumnFamiliesCommand::Help(ret);
+  CreateColumnFamilyCommand::Help(ret);
+  DropColumnFamilyCommand::Help(ret);
   DBFileDumperCommand::Help(ret);
   InternalDumpCommand::Help(ret);
   RepairCommand::Help(ret);

From 10bae8ceb39db5cb332cbf24f6eec60d8b7d7f20 Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 28 Jun 2019 17:38:34 -0700
Subject: [PATCH 186/572] Add more release versions to
 tools/check_format_compatible.sh (#5518)

Summary:
tools/check_format_compatible.sh has lagged behind the recent releases. Catch
it up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5518

Test Plan: Run the command

Differential Revision: D16063180

fbshipit-source-id: d063eb42df9653dec06a2cf0fb982b8a60ca3d2f
---
 tools/check_format_compatible.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 098f0d555a9..444c1111a73 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -55,9 +55,9 @@ EOF
 declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb")
 declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb")
-declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb")
+declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb")
 declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]} ${forward_compatible_with_options_checkout_objs[@]})
-declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb")
+declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb")

 generate_db() {

From 68b46a2e3699180609b65c2529b86b067bd1829d Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Sun, 30 Jun 2019 19:54:28 -0700
Subject: [PATCH 187/572] Block cache tracer: StartTrace returns Busy if trace
 is already started. (#5519)

Summary:
This PR is needed for integration into MyRocks. A second call on StartTrace
returns Busy so that MyRocks may return an error to the user.
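For illustration, a sketch of the caller-side handling this enables, modeled on the `ConsecutiveStartTrace` test added below (the trace path is a placeholder):

```
#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

// Attempt to start tracing; with this patch a second attempt surfaces
// Status::Busy() instead of silently returning OK.
rocksdb::Status StartBlockCacheTrace(rocksdb::BlockCacheTracer* tracer,
                                     rocksdb::Env* env,
                                     const std::string& trace_path) {
  rocksdb::TraceOptions trace_opts;
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(),
                                                  trace_path, &trace_writer);
  if (!s.ok()) {
    return s;
  }
  s = tracer->StartTrace(env, trace_opts, std::move(trace_writer));
  if (s.IsBusy()) {
    // A trace is already in progress; report the error to the caller
    // (e.g. MyRocks) rather than restarting.
  }
  return s;
}
```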
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5519 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16055476 Pulled By: HaoyuHuang fbshipit-source-id: a51772fb0965c873922757eb470a332b1e02a91d --- trace_replay/block_cache_tracer.cc | 2 +- trace_replay/block_cache_tracer_test.cc | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 4c5ad011609..b163216d874 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -233,7 +233,7 @@ Status BlockCacheTracer::StartTrace( std::unique_ptr&& trace_writer) { InstrumentedMutexLock lock_guard(&trace_writer_mutex_); if (writer_.load()) { - return Status::OK(); + return Status::Busy(); } trace_options_ = trace_options; writer_.store( diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 44cba7bfbd8..e7a5881044f 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -195,6 +195,17 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) { } } +TEST_F(BlockCacheTracerTest, ConsecutiveStartTrace) { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_NOK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(env_->FileExists(trace_file_path_)); +} + TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { BlockCacheTraceRecord record = GenerateAccessRecord(); { From 7259e28d915af72dd0cd6d055ab966644d83dd68 Mon Sep 17 00:00:00 2001 From: anand76 Date: Sun, 30 Jun 2019 20:52:34 -0700 Subject: [PATCH 188/572] MultiGet parallel IO (#5464) Summary: Enhancement to MultiGet batching to read data blocks required for keys in a batch in parallel from disk. It uses Env::MultiRead() API to read multiple blocks and reduce latency. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5464 Test Plan: 1. make check 2. make asan_check 3. make asan_crash Differential Revision: D15911771 Pulled By: anand1976 fbshipit-source-id: 605036b9af0f90ca0020dc87c3a86b4da6e83394 --- HISTORY.md | 1 + db/db_basic_test.cc | 289 ++++++++++ table/block_based/block_based_table_reader.cc | 519 +++++++++++++++--- table/block_based/block_based_table_reader.h | 23 +- table/format.h | 2 + util/file_reader_writer.cc | 43 ++ util/file_reader_writer.h | 2 + 7 files changed, 812 insertions(+), 67 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 79feac37cbb..2c8dc8c3ab9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -25,6 +25,7 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases * Log Writer will flush after finishing the whole record, rather than a fragment. +* Lower MultiGet batching API latency by reading data blocks from disk in parallel ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. 
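For illustration, the batched MultiGet() overload whose latency this change lowers, sketched with placeholder keys (the same overload is exercised by the new db_basic_test below):

```
#include <string>
#include <vector>
#include "rocksdb/db.h"

void BatchedLookup(rocksdb::DB* db) {
  // Keys are kept sorted, matching sorted_input=true below.
  std::vector<std::string> key_strs{"k1", "k2", "k3"};
  std::vector<rocksdb::Slice> keys(key_strs.begin(), key_strs.end());
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> statuses(keys.size());
  // With this patch, data blocks that miss the block cache are fetched in a
  // single Env::MultiRead() call instead of one serial read per block.
  db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), keys.size(),
               keys.data(), values.data(), statuses.data(),
               /*sorted_input=*/true);
  for (size_t i = 0; i < keys.size(); ++i) {
    if (statuses[i].ok()) {
      // values[i] references the result without an extra copy.
    }
  }
}
```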
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 1aec864dd6f..66d3b3aff7c 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "table/block_based/block_builder.h" #include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" @@ -1285,6 +1286,294 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } +class DBBasicTestWithParallelIO + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBasicTestWithParallelIO() + : DBTestBase("/db_basic_test_with_parallel_io") { + bool compressed_cache = std::get<0>(GetParam()); + bool uncompressed_cache = std::get<1>(GetParam()); + compression_enabled_ = std::get<2>(GetParam()); + fill_cache_ = std::get<3>(GetParam()); + + if (compressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + compressed_cache_ = std::make_shared(cache); + } + if (uncompressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + uncompressed_cache_ = std::make_shared(cache); + } + + env_->count_random_reads_ = true; + + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_cache = uncompressed_cache_; + table_options.block_cache_compressed = compressed_cache_; + table_options.flush_block_policy_factory.reset( + new MyFlushBlockPolicyFactory()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + if (!compression_enabled_) { + options.compression = kNoCompression; + } + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. 
A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), values_[i]) == Status::OK()); + } + Flush(); + } + + bool CheckValue(int i, const std::string& value) { + if (values_[i].compare(value) == 0) { + return true; + } + return false; + } + + int num_lookups() { return uncompressed_cache_->num_lookups(); } + int num_found() { return uncompressed_cache_->num_found(); } + int num_inserts() { return uncompressed_cache_->num_inserts(); } + + int num_lookups_compressed() { + return compressed_cache_->num_lookups(); + } + int num_found_compressed() { + return compressed_cache_->num_found(); + } + int num_inserts_compressed() { + return compressed_cache_->num_inserts(); + } + + bool fill_cache() { return fill_cache_; } + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + private: + class MyFlushBlockPolicyFactory + : public FlushBlockPolicyFactory { + public: + MyFlushBlockPolicyFactory() {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(data_block_builder); + } + }; + + class MyFlushBlockPolicy + : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) + : num_keys_(0), data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == 10) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + int num_keys_; + const BlockBuilder& data_block_builder_; + }; + + class MyBlockCache + : public Cache { + public: + explicit MyBlockCache(std::shared_ptr& target) + : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} + + virtual const char* Name() const override { return "MyBlockCache"; } + + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + num_inserts_++; + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + virtual Handle* Lookup(const Slice& key, + Statistics* stats = nullptr) override { + num_lookups_++; + Handle* handle = target_->Lookup(key, stats); + if (handle != nullptr) { + num_found_++; + } + return handle; + } + + virtual bool Ref(Handle* handle) override { + return target_->Ref(handle); + } + + virtual bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + virtual void* Value(Handle* handle) override { + return target_->Value(handle); + } + + virtual void Erase(const Slice& key) override { + target_->Erase(key); + } + virtual uint64_t NewId() override { + return target_->NewId(); + } + + virtual void SetCapacity(size_t capacity) override { + target_->SetCapacity(capacity); + } + + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + virtual bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + virtual size_t GetCapacity() const override { + return 
target_->GetCapacity(); + } + + virtual size_t GetUsage() const override { + return target_->GetUsage(); + } + + virtual size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + virtual size_t GetPinnedUsage() const override { + return target_->GetPinnedUsage(); + } + + virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + return target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + virtual void EraseUnRefEntries() override { + return target_->EraseUnRefEntries(); + } + + int num_lookups() { return num_lookups_; } + + int num_found() { return num_found_; } + + int num_inserts() { return num_inserts_; } + private: + std::shared_ptr target_; + int num_lookups_; + int num_found_; + int num_inserts_; + }; + + std::shared_ptr compressed_cache_; + std::shared_ptr uncompressed_cache_; + bool compression_enabled_; + std::vector values_; + bool fill_cache_; +}; + +TEST_P(DBBasicTestWithParallelIO, MultiGet) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + int expected_reads = random_reads + (fill_cache() ? 0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector key_ints{1,2,15,16,55,81,82,83,84,85}; + for (size_t i = 0; i < key_ints.size(); ++i) { + key_data[i] = Key(key_ints[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_ints.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); + } + expected_reads += (fill_cache() ? 
2 : 4); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); +} + +INSTANTIATE_TEST_CASE_P( + ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + ::testing::Values(std::make_tuple(false, true, true, true), + std::make_tuple(true, true, true, true), + std::make_tuple(false, true, false, true), + std::make_tuple(false, true, true, false), + std::make_tuple(true, true, true, false), + std::make_tuple(false, true, false, false))); + class DBBasicTestWithTimestampWithParam : public DBTestBase, public testing::WithParamInterface { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 017d6126c2b..edddecf78bd 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -160,6 +160,13 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } } +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + } // namespace // Encapsulates common functionality for the various index reader @@ -421,7 +428,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context); + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -1745,8 +1753,6 @@ Status BlockBasedTable::PutDataBlockToCache( : Cache::Priority::LOW; assert(cached_block); assert(cached_block->IsEmpty()); - assert(raw_block_comp_type == kNoCompression || - block_cache_compressed != nullptr); Status s; Statistics* statistics = ioptions.statistics; @@ -2195,11 +2201,105 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( rep->index_value_is_full, block_contents_pinned); } +// Convert an uncompressed data block (i.e CachableEntry) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, CachableEntry& block, TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// Lookup the cache for the given data block referenced by an index iterator +// value (i.e BlockHandle). If it exists in the cache, initialize block to +// the contents of the data block. +Status BlockBasedTable::GetDataBlockFromCache( + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block, BlockType block_type, + GetContext* get_context) const { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock(nullptr, ro, handle, uncompression_dict, block, + block_type, get_context, &lookup_data_block_context); + if (s.IsIncomplete()) { + s = Status::OK(); + } + + return s; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. 
Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -2231,14 +2331,17 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( compressed_cache_key); } - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { - // TODO(haoyu): Differentiate cache hit on uncompressed block cache and - // compressed block cache. - is_cache_hit = true; + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } } + // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { @@ -2248,7 +2351,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; - { + if (!contents) { StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, @@ -2259,6 +2362,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); } if (s.ok()) { @@ -2266,7 +2372,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // If filling cache is allowed and a cache is configured, try to put the // block to the cache. s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - block_entry, &raw_block_contents, + block_entry, contents, raw_block_comp_type, uncompression_dict, seq_no, GetMemoryAllocator(rep_->table_options), block_type, get_context); @@ -2331,6 +2437,172 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( return s; } +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry returned will +// own the data bytes. 
+// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. Some of them me be NULL handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::MaybeLoadBlocksToCache( + const ReadOptions& options, + const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector< + CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, + const UncompressionDict& uncompression_dict) const { + + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = RetrieveBlock(nullptr, options, handle, + uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context); + } + return; + } + + autovector read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + ReadRequest req; + req.len = handle.size() + kBlockTrailerSize; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.offset = handle.offset(); + req.status = Status::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + size_t read_req_idx = 0; + idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + ReadRequest& req = read_reqs[read_req_idx++]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != handle.size() + kBlockTrailerSize) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + + ToString(handle.size() + kBlockTrailerSize) + + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + if (s.ok()) { + if (scratch == nullptr) { + // We allocated a buffer for this block. 
Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr raw_block(req.scratch); + raw_block_contents = BlockContents(std::move(raw_block), + handle.size()); + } else { + // We used the scratch buffer, so no need to free anything + raw_block_contents = BlockContents(Slice(req.scratch, + handle.size())); + } +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = DecodeFixed32(data + handle.size() + 1); + s = rocksdb::VerifyChecksum(footer.checksum(), req.result.data(), + handle.size() + 1, expected); + } + } + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache(nullptr, options, handle, + uncompression_dict, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &raw_block_contents); + } else { + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data(), handle.size(), + &contents, footer.version(), rep_->ioptions, + memory_allocator); + } else { + if (scratch != nullptr) { + // If we used the scratch buffer, then the contents need to be + // copied to heap + Slice raw = Slice(req.result.data(), handle.size()); + contents = BlockContents(CopyBufferToHeap( + GetMemoryAllocator(rep_->table_options), raw), + handle.size()); + } else { + contents = std::move(raw_block_contents); + } + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue(new Block(std::move(contents), + global_seqno, read_amp_bytes_per_bit, ioptions.statistics)); + } + } + } + (*statuses)[idx_in_batch] = s; + } +} + Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, @@ -2347,7 +2619,8 @@ Status BlockBasedTable::RetrieveBlock( block_type != BlockType::kIndex)) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, uncompression_dict, block_entry, - block_type, get_context, lookup_context); + block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; @@ -3248,8 +3521,101 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, iiter_unique_ptr.reset(iiter); } - DataBlockIter biter; uint64_t offset = std::numeric_limits::max(); + autovector block_handles; + autovector, MultiGetContext::MAX_BATCH_SIZE> results; + autovector statuses; + static const size_t kMultiGetReadStackBufSize = 8192; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + BlockCacheLookupContext lookup_compression_dict_context( + TableReaderCaller::kUserMultiGet); + auto uncompression_dict_storage = GetUncompressionDict(nullptr, no_io, + sst_file_range.begin()->get_context, + &lookup_compression_dict_context); + 
const UncompressionDict& uncompression_dict = + uncompression_dict_storage.GetValue() == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.GetValue(); + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + offset = v.handle.offset(); + BlockHandle handle = v.handle; + Status s = GetDataBlockFromCache(ro, handle, uncompression_dict, + &(results.back()), BlockType::kData, miter->get_context); + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + block_handles.emplace_back(handle); + total_len += handle.size(); + } + } + + if (total_len) { + char* scratch = nullptr; + // If the blocks need to be uncompressed and we don't need the + // compressed blocks, then we can use a contiguous block of + // memory to read in all the blocks as it will be temporary + // storage + // 1. If blocks are compressed and compressed block cache is there, + // alloc heap bufs + // 2. If blocks are uncompressed, alloc heap bufs + // 3. If blocks are compressed and no compressed block cache, use + // stack buf + if (rep_->table_options.block_cache_compressed == nullptr && + rep_->blocks_maybe_compressed) { + if (total_len <= kMultiGetReadStackBufSize) { + scratch = stack_buf; + } else { + scratch = new char[total_len]; + block_buf.reset(scratch); + } + } + MaybeLoadBlocksToCache(read_options, + &data_block_range, &block_handles, &statuses, &results, + scratch, uncompression_dict); + } + } + + DataBlockIter first_biter; + DataBlockIter next_biter; + size_t idx_in_batch = 0; for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); ++miter) { Status s; @@ -3257,83 +3623,97 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const Slice& key = miter->ikey; bool matched = false; // if such user key matched a key in SST bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - IndexValue v = iiter->value(); - if (!v.first_internal_key.empty() && !skip_filters && - UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { - // The requested key falls between highest key in previous block and - // lowest key in current block. 
- break; - } - + bool first_block = true; + do { + DataBlockIter* biter = nullptr; bool reusing_block = true; uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); - if (iiter->value().handle.offset() != offset) { - offset = iiter->value().handle.offset(); - biter.Invalidate(Status::OK()); + if (first_block) { + if (!block_handles[idx_in_batch].IsNull() || + !results[idx_in_batch].IsEmpty()) { + first_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, results[idx_in_batch], &first_biter, + statuses[idx_in_batch]); + reusing_block = false; + } + biter = &first_biter; + idx_in_batch++; + } else { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + next_biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, get_context, - &lookup_data_block_context, Status(), nullptr); + read_options, iiter->value().handle, &next_biter, + BlockType::kData, get_context, &lookup_data_block_context, + Status(), nullptr); + biter = &next_biter; reusing_block = false; } if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { + biter->status().IsIncomplete()) { // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for // whether we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); break; } - if (!biter.status().ok()) { - s = biter.status(); + if (!biter->status().ok()) { + s = biter->status(); break; } - bool may_exist = biter.SeekForGet(key); + bool may_exist = biter->SeekForGet(key); if (!may_exist) { // HashSeek cannot find the key this block and the the iter is not // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. 
- done = true; - } else { - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - Cleanable dummy; - Cleanable* value_pinner = nullptr; - - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - if (biter.IsValuePinned()) { - if (reusing_block) { - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(biter.cache_handle() != nullptr); - block_cache->Ref(biter.cache_handle()); - dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, - biter.cache_handle()); - value_pinner = &dummy; - } else { - value_pinner = &biter; - } - } + break; + } - if (!get_context->SaveValue(parsed_key, biter.value(), &matched, - value_pinner)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); - done = true; - break; + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; } } - s = biter.status(); + + if (!get_context->SaveValue( + parsed_key, biter->value(), &matched, value_pinner)) { + does_referenced_key_exist = true; + referenced_data_size = biter->key().size() + biter->value().size(); + done = true; + break; + } + s = biter->status(); } // Write the block cache access. 
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { @@ -3354,11 +3734,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, access_record, lookup_data_block_context.block_key, rep_->cf_name_for_tracing(), key); } + s = biter->status(); if (done) { // Avoid the extra Next which is expensive in two-level indexes break; } - } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + if (matched && filter != nullptr && !filter->IsBlockBased()) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 4356713910c..358bc8b8d22 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -233,6 +233,12 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + // input_iter: if it is not null, update this one and return it as Iterator + template + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, Status s) const; + class PartitionedIndexIteratorState; friend class PartitionIndexReader; @@ -276,7 +282,8 @@ class BlockBasedTable : public TableReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const; + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -289,6 +296,20 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context, bool for_compaction = false) const; + Status GetDataBlockFromCache( + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context) const; + + void MaybeLoadBlocksToCache( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector< + CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const; + // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. 
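The parallel read path above rests on the new `RandomAccessFileReader::MultiRead()`, used by `MaybeLoadBlocksToCache()` and declared in the file_reader_writer changes below. A sketch of its calling convention, with placeholder offsets and buffer sizes:

```
#include "util/file_reader_writer.h"

// Issue two reads in one call; each ReadRequest carries its own status and
// result slice, so failures can be handled per block.
rocksdb::Status ReadTwoBlocks(rocksdb::RandomAccessFileReader* reader) {
  char buf0[4096];
  char buf1[4096];
  rocksdb::ReadRequest reqs[2];
  reqs[0].offset = 0;
  reqs[0].len = sizeof(buf0);
  reqs[0].scratch = buf0;
  reqs[1].offset = 8192;
  reqs[1].len = sizeof(buf1);
  reqs[1].scratch = buf1;
  rocksdb::Status s = reader->MultiRead(reqs, 2);
  for (auto& req : reqs) {
    if (!req.status.ok()) {
      return req.status;
    }
    // req.result points at the bytes actually read into req.scratch.
  }
  return s;
}
```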
diff --git a/table/format.h b/table/format.h index 539ca88805c..effc13addaf 100644 --- a/table/format.h +++ b/table/format.h @@ -26,7 +26,9 @@ #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" +#include "util/crc32c.h" #include "util/file_reader_writer.h" +#include "util/xxhash.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index bf88503339a..f49866d13e7 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -192,6 +192,49 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, return s; } +Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, + size_t num_reqs) const { + Status s; + uint64_t elapsed = 0; + assert(!use_direct_io()); + assert(!for_compaction_); + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif // ROCKSDB_LITE + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->MultiRead(read_reqs, num_reqs); + } + for (size_t i = 0; i < num_reqs; ++i) { +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(read_reqs[i].offset, + read_reqs[i].result.size(), start_ts, finish_ts, + read_reqs[i].status); + } +#endif // ROCKSDB_LITE + IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + } + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} + Status WritableFileWriter::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 01df1067ed9..0a7e5032d2f 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -161,6 +161,8 @@ class RandomAccessFileReader { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, bool for_compaction = false) const; + Status MultiRead(ReadRequest* reqs, size_t num_reqs) const; + Status Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n); } From c36067575037573a1ee3980bf8c27a93b4cf0694 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 1 Jul 2019 11:45:12 -0700 Subject: [PATCH 189/572] Add secondary instance to stress test (#5479) Summary: This PR allows users to run stress tests on secondary instance. 
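Both primitives the stress test relies on are already public; a minimal sketch (paths are placeholders):

```
#include <string>
#include "rocksdb/db.h"

// Open a read-only secondary that tails the primary's MANIFEST and WAL,
// then replay whatever the primary has written since the open.
rocksdb::Status RunSecondary(const std::string& primary_path,
                             const std::string& secondary_path) {
  rocksdb::Options options;
  options.max_open_files = -1;  // keep all SST files open in the secondary
  rocksdb::DB* secondary = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(options, primary_path,
                                                   secondary_path, &secondary);
  if (!s.ok()) {
    return s;
  }
  s = secondary->TryCatchUpWithPrimary();
  delete secondary;
  return s;
}
```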
Test plan (on devserver)
```
./db_stress -ops_per_thread=100000 -enable_secondary=true -threads=32 -secondary_catch_up_one_in=10000 -clear_column_family_one_in=1000 -reopen=100
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5479

Differential Revision: D16074325

Pulled By: riversand963

fbshipit-source-id: c0ed959e7b6c7cda3efd0b3070ab379de3b29f1c
---
 tools/db_stress.cc | 171 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)

diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 6a3e8bdefb1..813f8068278 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -333,6 +333,11 @@ DEFINE_bool(use_block_based_filter, false, "use block based filter"

 DEFINE_string(db, "", "Use the db with the following name.");

+DEFINE_string(secondaries_base, "",
+              "Use this path as the base path for secondary instances.");
+
+DEFINE_bool(enable_secondary, false, "Enable secondary instance.");
+
 DEFINE_string(
     expected_values_path, "",
     "File where the array of expected uint32_t values will be stored. If "
@@ -599,6 +604,13 @@ DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");

 DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");

+DEFINE_int32(secondary_catch_up_one_in, 0,
+             "If non-zero, the secondaries attempt to catch up with the primary "
+             "once for every N operations on average. 0 indicates the "
+             "secondaries do not try to catch up after open.");
+
+static std::shared_ptr<rocksdb::Statistics> dbstats_secondaries;
+
 enum RepFactory {
   kSkipList,
   kHashSkipList,
@@ -1423,6 +1435,17 @@ class StressTest {
     }
     column_families_.clear();
     delete db_;
+
+    assert(secondaries_.size() == secondary_cfh_lists_.size());
+    size_t n = secondaries_.size();
+    for (size_t i = 0; i != n; ++i) {
+      for (auto* cf : secondary_cfh_lists_[i]) {
+        delete cf;
+      }
+      secondary_cfh_lists_[i].clear();
+      delete secondaries_[i];
+    }
+    secondaries_.clear();
   }

   std::shared_ptr<Cache> NewCache(size_t capacity) {
@@ -1620,6 +1643,60 @@ class StressTest {
       }
     }

+#ifndef ROCKSDB_LITE
+    if (FLAGS_enable_secondary) {
+      now = FLAGS_env->NowMicros();
+      fprintf(stdout, "%s Start to verify secondaries against primary\n",
+              FLAGS_env->TimeToString(static_cast<uint64_t>(now) / 1000000)
+                  .c_str());
+    }
+    for (size_t k = 0; k != secondaries_.size(); ++k) {
+      Status s = secondaries_[k]->TryCatchUpWithPrimary();
+      if (!s.ok()) {
+        fprintf(stderr, "Secondary failed to catch up with primary\n");
+        return false;
+      }
+      ReadOptions ropts;
+      ropts.total_order_seek = true;
+      // Verify only the default column family since the primary may have
+      // dropped other column families after most recent reopen.
+ std::unique_ptr iter1(db_->NewIterator(ropts)); + std::unique_ptr iter2(secondaries_[k]->NewIterator(ropts)); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + if (iter1->key().compare(iter2->key()) != 0 || + iter1->value().compare(iter2->value())) { + fprintf(stderr, + "Secondary %d contains different data from " + "primary.\nPrimary: %s : %s\nSecondary: %s : %s\n", + static_cast(k), + iter1->key().ToString(/*hex=*/true).c_str(), + iter1->value().ToString(/*hex=*/true).c_str(), + iter2->key().ToString(/*hex=*/true).c_str(), + iter2->value().ToString(/*hex=*/true).c_str()); + return false; + } + } + if (iter1->Valid() && !iter2->Valid()) { + fprintf(stderr, + "Secondary %d record count is smaller than that of primary\n", + static_cast(k)); + return false; + } else if (!iter1->Valid() && iter2->Valid()) { + fprintf(stderr, + "Secondary %d record count is larger than that of primary\n", + static_cast(k)); + return false; + } + } + if (FLAGS_enable_secondary) { + now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + FLAGS_env->TimeToString(static_cast(now) / 1000000) + .c_str()); + } +#endif // ROCKSDB_LITE + if (shared.HasVerificationFailedYet()) { printf("Verification failed :(\n"); return false; @@ -2231,6 +2308,19 @@ class StressTest { TestIterate(thread, read_opts, rand_column_families, rand_keys); } thread->stats.FinishedSingleOp(); +#ifndef ROCKSDB_LITE + uint32_t tid = thread->tid; + assert(secondaries_.empty() || + static_cast(tid) < secondaries_.size()); + if (FLAGS_secondary_catch_up_one_in > 0 && + thread->rand.Uniform(FLAGS_secondary_catch_up_one_in) == 0) { + Status s = secondaries_[tid]->TryCatchUpWithPrimary(); + if (!s.ok()) { + VerificationAbort(shared, "Secondary instance failed to catch up", s); + break; + } + } +#endif } thread->stats.Stop(); @@ -2864,11 +2954,52 @@ class StressTest { } assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); + + if (FLAGS_enable_secondary) { +#ifndef ROCKSDB_LITE + secondaries_.resize(FLAGS_threads); + std::fill(secondaries_.begin(), secondaries_.end(), nullptr); + secondary_cfh_lists_.clear(); + secondary_cfh_lists_.resize(FLAGS_threads); + Options tmp_opts; + tmp_opts.max_open_files = FLAGS_open_files; + tmp_opts.statistics = dbstats_secondaries; + tmp_opts.env = FLAGS_env; + for (size_t i = 0; i != static_cast(FLAGS_threads); ++i) { + const std::string secondary_path = + FLAGS_secondaries_base + "/" + std::to_string(i); + s = DB::OpenAsSecondary(tmp_opts, FLAGS_db, secondary_path, + cf_descriptors, &secondary_cfh_lists_[i], + &secondaries_[i]); + if (!s.ok()) { + break; + } + } +#else + fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); + exit(1); +#endif + } } else { #ifndef ROCKSDB_LITE DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); db_ = db_with_ttl; + if (FLAGS_enable_secondary) { + secondaries_.resize(FLAGS_threads); + std::fill(secondaries_.begin(), secondaries_.end(), nullptr); + Options tmp_opts; + tmp_opts.max_open_files = FLAGS_open_files; + for (size_t i = 0; i != static_cast(FLAGS_threads); ++i) { + const std::string secondary_path = + FLAGS_secondaries_base + "/" + std::to_string(i); + s = DB::OpenAsSecondary(tmp_opts, FLAGS_db, secondary_path, + &secondaries_[i]); + if (!s.ok()) { + break; + } + } + } #else fprintf(stderr, "TTL is not supported in RocksDBLite\n"); exit(1); @@ -2891,6 +3022,17 @@ class StressTest { txn_db_ = 
nullptr;
 #endif

+    assert(secondaries_.size() == secondary_cfh_lists_.size());
+    size_t n = secondaries_.size();
+    for (size_t i = 0; i != n; ++i) {
+      for (auto* cf : secondary_cfh_lists_[i]) {
+        delete cf;
+      }
+      secondary_cfh_lists_[i].clear();
+      delete secondaries_[i];
+    }
+    secondaries_.clear();
+
     num_times_reopened_++;
     auto now = FLAGS_env->NowMicros();
     fprintf(stdout, "%s Reopening database for the %dth time\n",
@@ -2903,6 +3045,10 @@
     if (dbstats) {
       fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
     }
+    if (dbstats_secondaries) {
+      fprintf(stdout, "Secondary instances STATISTICS:\n%s\n",
+              dbstats_secondaries->ToString().c_str());
+    }
   }

   std::shared_ptr<Cache> cache_;
@@ -2920,6 +3066,10 @@
   std::unordered_map<std::string, std::vector<std::string>> options_table_;
   std::vector<std::string> options_index_;
   std::atomic<bool> db_preload_finished_;
+
+  // Fields used for stress-testing secondary instance in the same process
+  std::vector<DB*> secondaries_;
+  std::vector<std::vector<ColumnFamilyHandle*> > secondary_cfh_lists_;
 };

 class NonBatchedOpsStressTest : public StressTest {
@@ -4153,6 +4303,9 @@ int main(int argc, char** argv) {

   if (FLAGS_statistics) {
     dbstats = rocksdb::CreateDBStatistics();
+    if (FLAGS_enable_secondary) {
+      dbstats_secondaries = rocksdb::CreateDBStatistics();
+    }
   }
   FLAGS_compression_type_e =
     StringToCompressionType(FLAGS_compression_type.c_str());
@@ -4261,6 +4414,24 @@
     FLAGS_db = default_db_path;
   }

+  if (FLAGS_enable_secondary && FLAGS_secondaries_base.empty()) {
+    std::string default_secondaries_path;
+    FLAGS_env->GetTestDirectory(&default_secondaries_path);
+    default_secondaries_path += "/dbstress_secondaries";
+    rocksdb::Status s = FLAGS_env->CreateDirIfMissing(default_secondaries_path);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to create directory %s: %s\n",
+              default_secondaries_path.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+    FLAGS_secondaries_base = default_secondaries_path;
+  }
+
+  if (!FLAGS_enable_secondary && FLAGS_secondary_catch_up_one_in > 0) {
+    fprintf(stderr, "Secondary instance is disabled.\n");
+    exit(1);
+  }
+
   rocksdb_kill_odds = FLAGS_kill_random_test;
   rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist);

From 3886dddc3b44bf5061c0f93eab578c51e8bad7bd Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Mon, 1 Jul 2019 11:53:25 -0700
Subject: [PATCH 190/572] force flushing stats CF to avoid holding old logs
 (#5509)

Summary:
WAL records RocksDB writes to all column families. When a user flushes a
column family, the old WAL will not accept new writes but cannot be deleted
yet, because it may still contain live data for other column families. (See
https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log#life-cycle-of-a-wal
for a detailed explanation.)
Because of this, a column family that receives very infrequent writes and is
never flushed manually can prevent a lot of WALs from being deleted. PR
https://github.com/facebook/rocksdb/pull/5046 introduced the persistent stats
column family, which is a good example of such a column family. Depending on
the config, it may have long intervals between writes, and the user is
typically unaware of it, which makes it difficult to call a manual flush for
it.
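A sketch of the failure mode just described, assuming two user column families where only one is written frequently (names and counts are illustrative):

```
#include <cassert>
#include <string>
#include "rocksdb/db.h"

void IllustrateWalPinning(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* hot,
                          rocksdb::ColumnFamilyHandle* cold) {
  // One early write to the cold CF lands in the current WAL, say WAL 1.
  assert(db->Put(rocksdb::WriteOptions(), cold, "k", "v").ok());
  // The hot CF keeps writing and flushing, moving on to newer WALs...
  for (int i = 0; i < 1000; ++i) {
    assert(db->Put(rocksdb::WriteOptions(), hot, "k" + std::to_string(i), "v")
               .ok());
  }
  assert(db->Flush(rocksdb::FlushOptions(), hot).ok());
  // ...but WAL 1 cannot be deleted: it still holds the cold CF's only
  // durable copy of "k" until that CF's memtable is flushed as well.
  assert(db->Flush(rocksdb::FlushOptions(), cold).ok());
}
```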
This PR addresses the problem for persistent stats column family by forcing a flush for persistent stats column family when 1) another column family is flushed 2) persistent stats column family's log number is the smallest among all column families, this way persistent stats column family will keep advancing its log number when necessary, allowing RocksDB to delete old WAL files. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5509 Differential Revision: D16045896 Pulled By: miasantreble fbshipit-source-id: 286837b633e988417f0096ff38384742d3b40ef4 --- db/db_impl/db_impl.h | 2 + db/db_impl/db_impl_compaction_flush.cc | 28 +++++++++- db/db_impl/db_impl_write.cc | 37 +++++++++++++ monitoring/stats_history_test.cc | 77 +++++++++++++++++++++++++- 4 files changed, 142 insertions(+), 2 deletions(-) diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index b5437c49543..e57768a74af 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1292,6 +1292,8 @@ class DBImpl : public DB { Status ScheduleFlushes(WriteContext* context); + void MaybeFlushStatsCF(autovector* cfds); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); void SelectColumnFamiliesForAtomicFlush(autovector* cfds); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 8cb37484cac..ff03e591d28 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1551,13 +1551,39 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); } - if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { flush_memtable_id = cfd->imm()->GetLatestMemTableID(); flush_req.emplace_back(cfd, flush_memtable_id); } + if (immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + !cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd_stats, flush_memtable_id); + } + } + } } if (s.ok() && !flush_req.empty()) { diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 21b123c3a94..c0d320013b7 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1228,6 +1228,7 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { cfds.push_back(cfd); } } + MaybeFlushStatsCF(&cfds); } for (const auto cfd : cfds) { cfd->Ref(); @@ -1294,6 +1295,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { if (cfd_picked != nullptr) { cfds.push_back(cfd_picked); } + MaybeFlushStatsCF(&cfds); } for (const auto cfd : cfds) { @@ -1437,6 +1439,40 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const 
WriteOptions& write_options, return Status::OK(); } +void DBImpl::MaybeFlushStatsCF(autovector* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + Status DBImpl::ScheduleFlushes(WriteContext* context) { autovector cfds; if (immutable_db_options_.atomic_flush) { @@ -1450,6 +1486,7 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { cfds.push_back(tmp_cfd); } + MaybeFlushStatsCF(&cfds); } Status status; for (auto& cfd : cfds) { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 16681fe05d8..bef928558d7 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -561,7 +561,7 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { Close(); // Reopen and flush memtable. - Reopen(options); + ASSERT_OK(TryReopen(options)); Flush(); Close(); // Now check keys in read only mode. 
@@ -569,6 +569,81 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { } #endif // !ROCKSDB_LITE +TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ColumnFamilyData* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_stats = static_cast( + dbfull()->PersistentStatsColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_test = + static_cast(handles_[1])->cfd(); + + ASSERT_OK(Put("foo", "v0")); + ASSERT_OK(Put("bar", "v0")); + ASSERT_EQ("v0", Get("bar")); + ASSERT_EQ("v0", Get("foo")); + ASSERT_OK(Put(1, "Eevee", "v0")); + ASSERT_EQ("v0", Get(1, "Eevee")); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + // writing to all three cf, flush default cf + // LogNumbers: default: 14, stats: 4, pikachu: 4 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo1", "v1")); + ASSERT_OK(Put("bar1", "v1")); + ASSERT_EQ("v1", Get("bar1")); + ASSERT_EQ("v1", Get("foo1")); + ASSERT_OK(Put(1, "Vaporeon", "v1")); + ASSERT_EQ("v1", Get(1, "Vaporeon")); + // writing to default and test cf, flush test cf + // LogNumbers: default: 14, stats: 16, pikachu: 16 + ASSERT_OK(Flush(1)); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_GT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo2", "v2")); + ASSERT_OK(Put("bar2", "v2")); + ASSERT_EQ("v2", Get("bar2")); + ASSERT_EQ("v2", Get("foo2")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + // writing to default and stats cf, flushing default cf + // LogNumbers: default: 19, stats: 19, pikachu: 19 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo3", "v3")); + ASSERT_OK(Put("bar3", "v3")); + ASSERT_EQ("v3", Get("bar3")); + ASSERT_EQ("v3", Get("foo3")); + ASSERT_OK(Put(1, "Jolteon", "v3")); + ASSERT_EQ("v3", Get(1, "Jolteon")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + // writing to all three cf, flushing test cf + // LogNumbers: default: 19, stats: 19, pikachu: 22 + ASSERT_OK(Flush(1)); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + Close(); +} + } // namespace rocksdb int main(int argc, char** argv) { From 9f0bd568897288952329e05bf2354cb21602cd6d Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 1 Jul 2019 12:43:14 -0700 Subject: [PATCH 191/572] Cache simulator: Refactor the cache simulator so that we can add alternative policies easily (#5517) Summary: This PR creates cache_simulator.h file. It contains a CacheSimulator that runs against a block cache trace record. We can add alternative cache simulators derived from CacheSimulator later. 
For example, this PR adds a PrioritizedCacheSimulator that inserts filter/index/uncompressed dictionary blocks with high priority. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5517 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16043689 Pulled By: HaoyuHuang fbshipit-source-id: 65f28ed52b866ffb0e6eceffd7f9ca7c45bb680d --- CMakeLists.txt | 1 + TARGETS | 1 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 67 +++--------- tools/block_cache_trace_analyzer.h | 49 +-------- utilities/simulator_cache/cache_simulator.cc | 104 +++++++++++++++++++ utilities/simulator_cache/cache_simulator.h | 98 +++++++++++++++++ 7 files changed, 219 insertions(+), 102 deletions(-) create mode 100644 utilities/simulator_cache/cache_simulator.cc create mode 100644 utilities/simulator_cache/cache_simulator.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ff61dca99f..0ca338bd63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -685,6 +685,7 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc + utilities/simulator_cache/cache_simulator.cc utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc diff --git a/TARGETS b/TARGETS index a43ed6b1085..3935f1f740d 100644 --- a/TARGETS +++ b/TARGETS @@ -280,6 +280,7 @@ cpp_library( "utilities/persistent_cache/block_cache_tier_metadata.cc", "utilities/persistent_cache/persistent_cache_tier.cc", "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", diff --git a/src.mk b/src.mk index 71c2bd01803..7c35ee67589 100644 --- a/src.mk +++ b/src.mk @@ -199,6 +199,7 @@ LIB_SOURCES = \ utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/persistent_cache_tier.cc \ utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/simulator_cache/cache_simulator.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 78753a21622..4770348a79d 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -24,7 +24,7 @@ DEFINE_string( "The config file path. One cache configuration per line. The format of a " "cache configuration is " "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " - "cache_name is lru. cache_capacity can be xK, xM or xG " + "cache_name is lru or lru_priority. 
cache_capacity can be xK, xM or xG " "where x is a positive number."); DEFINE_int32(block_cache_trace_downsample_ratio, 1, "The trace collected accesses on one in every " @@ -179,47 +179,6 @@ double percent(uint64_t numerator, uint64_t denomenator) { } // namespace -BlockCacheTraceSimulator::BlockCacheTraceSimulator( - uint64_t warmup_seconds, uint32_t downsample_ratio, - const std::vector& cache_configurations) - : warmup_seconds_(warmup_seconds), - downsample_ratio_(downsample_ratio), - cache_configurations_(cache_configurations) { - for (auto const& config : cache_configurations_) { - for (auto cache_capacity : config.cache_capacities) { - // Scale down the cache capacity since the trace contains accesses on - // 1/'downsample_ratio' blocks. - uint64_t simulate_cache_capacity = - cache_capacity / downsample_ratio_; - sim_caches_.push_back(NewSimCache( - NewLRUCache(simulate_cache_capacity, config.num_shard_bits), - /*real_cache=*/nullptr, config.num_shard_bits)); - } - } -} - -void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { - if (trace_start_time_ == 0) { - trace_start_time_ = access.access_timestamp; - } - // access.access_timestamp is in microseconds. - if (!warmup_complete_ && - trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= - access.access_timestamp) { - for (auto& sim_cache : sim_caches_) { - sim_cache->reset_counter(); - } - warmup_complete_ = true; - } - for (auto& sim_cache : sim_caches_) { - auto handle = sim_cache->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - sim_cache->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr); - } - } -} - void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (!cache_simulator_) { return; @@ -237,27 +196,21 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { const std::string header = "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; out << header << std::endl; - uint64_t sim_cache_index = 0; - for (auto const& config : cache_simulator_->cache_configurations()) { - for (auto cache_capacity : config.cache_capacities) { - uint64_t hits = - cache_simulator_->sim_caches()[sim_cache_index]->get_hit_counter(); - uint64_t misses = - cache_simulator_->sim_caches()[sim_cache_index]->get_miss_counter(); - uint64_t total_accesses = hits + misses; - double miss_ratio = static_cast(misses * 100.0 / total_accesses); + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + double miss_ratio = config_caches.second[i]->miss_ratio(); // Write the body. 
out << config.cache_name; out << ","; out << config.num_shard_bits; out << ","; - out << cache_capacity; + out << config.cache_capacities[i]; out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; out << ","; - out << total_accesses; + out << config_caches.second[i]->total_accesses(); out << std::endl; - sim_cache_index++; } } out.close(); @@ -1095,6 +1048,12 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { if (!cache_configs.empty()) { cache_simulator.reset(new BlockCacheTraceSimulator( warmup_seconds, downsample_ratio, cache_configs)); + Status s = cache_simulator->InitializeCaches(); + if (!s.ok()) { + fprintf(stderr, "Cannot initialize cache simulators %s\n", + s.ToString().c_str()); + exit(1); + } } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 21a99f7db76..617b90280c9 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -12,57 +12,10 @@ #include "rocksdb/env.h" #include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" +#include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { -const uint64_t kMicrosInSecond = 1000000; - -class BlockCacheTraceAnalyzer; - -// A cache configuration provided by user. -struct CacheConfiguration { - std::string cache_name; // LRU. - uint32_t num_shard_bits; - std::vector - cache_capacities; // simulate cache capacities in bytes. -}; - -// A block cache simulator that reports miss ratio curves given a set of cache -// configurations. -class BlockCacheTraceSimulator { - public: - // warmup_seconds: The number of seconds to warmup simulated caches. The - // hit/miss counters are reset after the warmup completes. - BlockCacheTraceSimulator( - uint64_t warmup_seconds, uint32_t downsample_ratio, - const std::vector& cache_configurations); - ~BlockCacheTraceSimulator() = default; - // No copy and move. - BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; - BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; - BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; - BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; - - void Access(const BlockCacheTraceRecord& access); - - const std::vector>& sim_caches() const { - return sim_caches_; - } - - const std::vector& cache_configurations() const { - return cache_configurations_; - } - - private: - const uint64_t warmup_seconds_; - const uint32_t downsample_ratio_; - const std::vector cache_configurations_; - - bool warmup_complete_ = false; - std::vector> sim_caches_; - uint64_t trace_start_time_ = 0; -}; - // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc new file mode 100644 index 00000000000..145efdb6cba --- /dev/null +++ b/utilities/simulator_cache/cache_simulator.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/simulator_cache/cache_simulator.h" + +namespace rocksdb { +CacheSimulator::CacheSimulator(std::shared_ptr sim_cache) + : sim_cache_(sim_cache) {} + +void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + auto handle = sim_cache_->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr); + } +} + +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + auto handle = sim_cache_->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + Cache::Priority priority = Cache::Priority::LOW; + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + priority = Cache::Priority::HIGH; + } + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr, priority); + } +} + +double CacheSimulator::miss_ratio() { + uint64_t hits = sim_cache_->get_hit_counter(); + uint64_t misses = sim_cache_->get_miss_counter(); + uint64_t total_accesses = hits + misses; + return static_cast(misses * 100.0 / total_accesses); +} + +uint64_t CacheSimulator::total_accesses() { + return sim_cache_->get_hit_counter() + sim_cache_->get_miss_counter(); +} + +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), + cache_configurations_(cache_configurations) {} + +Status BlockCacheTraceSimulator::InitializeCaches() { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. + uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; + std::shared_ptr sim_cache; + if (config.cache_name == "lru") { + sim_cache = std::make_shared(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0), + /*real_cache=*/nullptr, config.num_shard_bits)); + } else if (config.cache_name == "lru_priority") { + sim_cache = std::make_shared(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*real_cache=*/nullptr, config.num_shard_bits)); + } else { + // Not supported. + return Status::InvalidArgument("Unknown cache name " + + config.cache_name); + } + sim_caches_[config].push_back(sim_cache); + } + } + return Status::OK(); +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. 
+ if (!warmup_complete_ &&
+ trace_start_time_ + warmup_seconds_ * kMicrosInSecond <=
+ access.access_timestamp) {
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->reset_counter();
+ }
+ }
+ warmup_complete_ = true;
+ }
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->Access(access);
+ }
+ }
+}
+
+} // namespace rocksdb

diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h
new file mode 100644
index 00000000000..37166d8a9c4
--- /dev/null
+++ b/utilities/simulator_cache/cache_simulator.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/utilities/sim_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+const uint64_t kMicrosInSecond = 1000000;
+
+// A cache configuration provided by user.
+struct CacheConfiguration {
+ std::string cache_name; // LRU.
+ uint32_t num_shard_bits;
+ std::vector
+ cache_capacities; // simulate cache capacities in bytes.
+
+ bool operator==(const CacheConfiguration& o) const {
+ return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits;
+ }
+ bool operator<(const CacheConfiguration& o) const {
+ return cache_name < o.cache_name ||
+ (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits);
+ }
+};
+
+// A cache simulator that runs against a block cache trace.
+class CacheSimulator {
+ public:
+ CacheSimulator(std::shared_ptr sim_cache);
+ virtual ~CacheSimulator() = default;
+ // No copy and move.
+ CacheSimulator(const CacheSimulator&) = delete;
+ CacheSimulator& operator=(const CacheSimulator&) = delete;
+ CacheSimulator(CacheSimulator&&) = delete;
+ CacheSimulator& operator=(CacheSimulator&&) = delete;
+
+ virtual void Access(const BlockCacheTraceRecord& access);
+ void reset_counter() { sim_cache_->reset_counter(); }
+ double miss_ratio();
+ uint64_t total_accesses();
+
+ protected:
+ std::shared_ptr sim_cache_;
+};
+
+// A prioritized cache simulator that runs against a block cache trace.
+// It inserts missing index/filter/uncompression-dictionary blocks with high
+// priority in the cache.
+class PrioritizedCacheSimulator : public CacheSimulator {
+ public:
+ PrioritizedCacheSimulator(std::shared_ptr sim_cache)
+ : CacheSimulator(sim_cache) {}
+ void Access(const BlockCacheTraceRecord& access) override;
+};
+
+// A block cache simulator that reports miss ratio curves given a set of cache
+// configurations.
+class BlockCacheTraceSimulator {
+ public:
+ // warmup_seconds: The number of seconds to warmup simulated caches. The
+ // hit/miss counters are reset after the warmup completes.
+ BlockCacheTraceSimulator(
+ uint64_t warmup_seconds, uint32_t downsample_ratio,
+ const std::vector& cache_configurations);
+ ~BlockCacheTraceSimulator() = default;
+ // No copy and move.
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + Status InitializeCaches(); + + void Access(const BlockCacheTraceRecord& access); + + const std::map>>& + sim_caches() const { + return sim_caches_; + } + + private: + const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::map>> + sim_caches_; + uint64_t trace_start_time_ = 0; +}; + +} // namespace rocksdb From f872009237762abb504c32c781a5b337033f401c Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Mon, 1 Jul 2019 13:02:30 -0700 Subject: [PATCH 192/572] Fix from some C-style casting (#5524) Summary: Fix from some C-style casting in bloom.cc and ./tools/db_bench_tool.cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/5524 Differential Revision: D16075626 Pulled By: elipoz fbshipit-source-id: 352948885efb64a7ef865942c75c3c727a914207 --- tools/db_bench_tool.cc | 29 ++++++++++++++++++----------- util/bloom.cc | 6 +++--- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 9b3e2cac35f..cb5b5a38a66 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2397,16 +2397,19 @@ class Benchmark { return nullptr; } if (FLAGS_use_clock_cache) { - auto cache = NewClockCache((size_t)capacity, FLAGS_cache_numshardbits); + auto cache = + NewClockCache(static_cast(capacity), FLAGS_cache_numshardbits); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); } return cache; } else { - return NewLRUCache((size_t)capacity, FLAGS_cache_numshardbits, - false /*strict_capacity_limit*/, - FLAGS_cache_high_pri_pool_ratio); + return NewLRUCache( + static_cast(capacity), + FLAGS_cache_numshardbits, + false /*strict_capacity_limit*/, + FLAGS_cache_high_pri_pool_ratio); } } @@ -3604,9 +3607,12 @@ class Benchmark { } if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != - (unsigned int)FLAGS_num_levels) { - fprintf(stderr, "Insufficient number of fanouts specified %d\n", - (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size()); + static_cast(FLAGS_num_levels)) { + fprintf( + stderr, + "Insufficient number of fanouts specified %d\n", + static_cast( + FLAGS_max_bytes_for_level_multiplier_additional_v.size())); exit(1); } options.max_bytes_for_level_multiplier_additional = @@ -4791,7 +4797,7 @@ class Benchmark { if (FLAGS_multiread_stride) { int64_t key = GetRandomKey(&thread->rand); if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >= - (int64_t)FLAGS_num) { + static_cast(FLAGS_num)) { key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride; } for (int64_t i = 0; i < entries_per_batch_; ++i) { @@ -5161,9 +5167,10 @@ class Benchmark { FLAGS_num, &lower_bound); options.iterate_lower_bound = &lower_bound; } else { + auto min_num = + std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance); GenerateKeyFromInt( - (uint64_t)std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance), - FLAGS_num, &upper_bound); + static_cast(min_num), FLAGS_num, &upper_bound); options.iterate_upper_bound = &upper_bound; } } @@ -5331,7 +5338,7 @@ class Benchmark { // Wait for the writes to be finished if (!hint_printed) { fprintf(stderr, "Reads are 
finished. Have %d more writes to do\n",
- (int)writes_ - written);
+ static_cast(writes_) - written);
 hint_printed = true;
 }
 } else {

diff --git a/util/bloom.cc b/util/bloom.cc
index 953a42fa213..f859ab7dd64 100644
--- a/util/bloom.cc
+++ b/util/bloom.cc
@@ -104,7 +104,7 @@ int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
 assert(bits_per_key_);
 assert(space > 0);
 uint32_t dont_care1, dont_care2;
- int high = (int) (space * 8 / bits_per_key_ + 1);
+ int high = static_cast(space * 8 / bits_per_key_ + 1);
 int low = 1;
 int n = high;
 for (; n >= low; n--) {
@@ -120,7 +120,7 @@ int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
 inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines,
 uint32_t total_bits) {
 #ifdef NDEBUG
- (void)total_bits;
+ static_cast(total_bits);
 #endif
 assert(num_lines > 0 && total_bits > 0);
@@ -340,7 +340,7 @@ class BloomFilterPolicy : public FilterPolicy {
 dst->resize(init_size + bytes, 0);
 dst->push_back(static_cast(num_probes_)); // Remember # of probes
 char* array = &(*dst)[init_size];
- for (size_t i = 0; i < (size_t)n; i++) {
+ for (size_t i = 0; i < static_cast(n); i++) {
 // Use double-hashing to generate a sequence of hash values.
 // See analysis in [Kirsch,Mitzenmacher 2006].
 uint32_t h = hash_func_(keys[i]);

From 1e87f2b68b01db0579fb98491114e8f059f680be Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Mon, 1 Jul 2019 14:04:10 -0700
Subject: [PATCH 193/572] Ref and unref cfd before and after calling WaitForFlushMemTables (#5513)

Summary:
This is to prevent the bg flush thread from unrefing and deleting the cfd that has been dropped by a concurrent thread. Before RocksDB calls `DBImpl::WaitForFlushMemTables`, we should increase the refcount of each `ColumnFamilyData` so that its ref count will not drop to 0 even if the column family is dropped by another thread. Otherwise the bg flush thread can deref the cfd and delete it, causing a segfault in `WaitForFlushMemtables` upon accessing `cfd`.

Test plan (on devserver):
```
$make clean && COMPILE_WITH_ASAN=1 make -j32
$make check
```
All unit tests must pass.
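Condensed from the diff below (a sketch of the pattern, not a drop-in excerpt), the fix pins each `ColumnFamilyData` before waiting and unpins it afterwards:

```
// Sketch condensed from the diff below: pin each cfd so a concurrent
// DropColumnFamily cannot free it while we wait, then unref afterwards
// and delete on the last reference.
for (auto* tmp_cfd : cfds) {
  tmp_cfd->Ref();
}
s = WaitForFlushMemTables(cfds, flush_memtable_ids,
                          (flush_reason == FlushReason::kErrorRecovery));
for (auto* tmp_cfd : cfds) {
  if (tmp_cfd->Unref()) {
    // Only one thread can observe the ref count dropping to zero.
    InstrumentedMutexLock lock_guard(&mutex_);
    delete tmp_cfd;
  }
}
```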
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5513 Differential Revision: D16062898 Pulled By: riversand963 fbshipit-source-id: 37dc511f1dc99f036d0201bbd7f0a8f5677c763d --- db/db_flush_test.cc | 76 ++++++++++++++++++++++++++ db/db_impl/db_impl.h | 10 ++++ db/db_impl/db_impl_compaction_flush.cc | 39 ++++++++++++- db/db_impl/db_impl_debug.cc | 10 ++++ 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index b901a5a7805..034ec63226c 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -290,6 +290,39 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { Close(); } +TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { + Options options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:AfterScheduleFlush", + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_.resize(1); + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -545,6 +578,49 @@ TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { handles_.clear(); } +TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(0, "key", "value")); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + auto* cfd_pikachu = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + delete handles_[1]; + handles_.resize(1); + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions 
flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu}, + flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index e57768a74af..737f2337608 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -788,6 +788,16 @@ class DBImpl : public DB { Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, ColumnFamilyHandle* cfh = nullptr); + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables(const autovector& cfds, + const FlushOptions& flush_opts); + // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index ff03e591d28..67292401683 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1591,6 +1591,16 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, ColumnFamilyData* loop_cfd = elem.first; loop_cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->Ref(); + } + } SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); } @@ -1599,7 +1609,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, write_thread_.ExitUnbatched(&w); } } - + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; @@ -1609,6 +1620,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* tmp_cfd : cfds) { + if (tmp_cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete tmp_cfd; + } + } } TEST_SYNC_POINT("FlushMemTableFinished"); return s; @@ -1672,6 +1690,15 @@ Status DBImpl::AtomicFlushMemTables( for (auto cfd : cfds) { cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. 
+ if (flush_options.wait) { + for (auto cfd : cfds) { + cfd->Ref(); + } + } GenerateFlushRequest(cfds, &flush_req); SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); @@ -1682,7 +1709,7 @@ Status DBImpl::AtomicFlushMemTables( } } TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); - + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; for (auto& iter : flush_req) { @@ -1690,6 +1717,13 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* cfd : cfds) { + if (cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete cfd; + } + } } return s; } @@ -2151,6 +2185,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); // All the CFDs in the FlushReq must have the same flush reason, so just // grab the first one *reason = bg_flush_args[0].cfd_->GetFlushReason(); diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index ec1e1b47752..ec8489848c5 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -122,6 +122,16 @@ Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, return FlushMemTable(cfd, fo, FlushReason::kTest); } +Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts) { + return FlushMemTable(cfd, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_AtomicFlushMemTables( + const autovector& cfds, const FlushOptions& flush_opts) { + return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { From 3e6c18538130a4fafb491a5a45dc614958cfe50b Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Mon, 1 Jul 2019 14:53:51 -0700 Subject: [PATCH 194/572] Formatting fixes in db_bench_tool (#5525) Summary: Formatting fixes in db_bench_tool that were accidentally omitted Pull Request resolved: https://github.com/facebook/rocksdb/pull/5525 Test Plan: Unit tests Differential Revision: D16078516 Pulled By: elipoz fbshipit-source-id: bf8df0e3f08092a91794ebf285396d9b8a335bb9 --- tools/db_bench_tool.cc | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index cb5b5a38a66..8344669b75c 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2397,8 +2397,8 @@ class Benchmark { return nullptr; } if (FLAGS_use_clock_cache) { - auto cache = - NewClockCache(static_cast(capacity), FLAGS_cache_numshardbits); + auto cache = NewClockCache(static_cast(capacity), + FLAGS_cache_numshardbits); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); @@ -2406,10 +2406,8 @@ class Benchmark { return cache; } else { return NewLRUCache( - static_cast(capacity), - FLAGS_cache_numshardbits, - false /*strict_capacity_limit*/, - FLAGS_cache_high_pri_pool_ratio); + static_cast(capacity), FLAGS_cache_numshardbits, + false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio); } } @@ -3608,11 +3606,9 @@ class Benchmark { if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { if 
(FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
 static_cast(FLAGS_num_levels)) {
- fprintf(
- stderr,
- "Insufficient number of fanouts specified %d\n",
- static_cast(
- FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
+ fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+ static_cast(
+ FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
 exit(1);
 }
 options.max_bytes_for_level_multiplier_additional =
@@ -5168,9 +5164,9 @@ class Benchmark {
 options.iterate_lower_bound = &lower_bound;
 } else {
 auto min_num =
- std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
- GenerateKeyFromInt(
- static_cast(min_num), FLAGS_num, &upper_bound);
+ std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
+ GenerateKeyFromInt(static_cast(min_num), FLAGS_num,
+ &upper_bound);
 options.iterate_upper_bound = &upper_bound;
 }
 }

From 66464d1fde0257af79b97018a2f6be554f41ff20 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Mon, 1 Jul 2019 15:11:43 -0700
Subject: [PATCH 195/572] Remove multiple declarations of kMicrosInSecond.

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5526

Test Plan:
OPT=-g V=1 make J=1 unity_test -j32
make clean && make -j32

Differential Revision: D16079315

Pulled By: HaoyuHuang

fbshipit-source-id: 294ab439cf0db8dd5da44e30eabf0cbb2bb8c4f6
---
 db/db_impl/db_impl.cc | 1 -
 tools/block_cache_trace_analyzer_test.cc | 2 +-
 tools/db_bench_tool.cc | 1 -
 trace_replay/block_cache_tracer.cc | 1 +
 trace_replay/block_cache_tracer.h | 2 ++
 utilities/simulator_cache/cache_simulator.h | 2 --
 6 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index e2de696ef57..55f89eab32e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -101,7 +101,6 @@ namespace rocksdb {
 const std::string kDefaultColumnFamilyName("default");
 const std::string kPersistentStatsColumnFamilyName(
 "___rocksdb_stats_history___");
-const int kMicrosInSecond = 1000 * 1000;
 void DumpRocksDBBuildVersion(Logger* log);
 CompressionType GetCompressionFlush(

diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
index 80734565a3d..21d8bcbbb3f 100644
--- a/tools/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -91,7 +91,7 @@ class BlockCacheTracerTest : public testing::Test {
 assert(writer);
 for (uint32_t i = 0; i < nblocks; i++) {
 uint32_t key_id = from_key_id + i;
- uint32_t timestamp = (key_id + 1) * kMicrosInSecond;
+ uint64_t timestamp = (key_id + 1) * kMicrosInSecond;
 BlockCacheTraceRecord record;
 record.block_type = block_type;
 record.block_size = kBlockSize + key_id;

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 8344669b75c..abffae5d9e8 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1547,7 +1547,6 @@ class ReporterAgent {
 private:
 std::string Header() const { return "secs_elapsed,interval_qps"; }
 void SleepAndReport() {
- uint64_t kMicrosInSecond = 1000 * 1000;
 auto time_started = env_->NowMicros();
 while (true) {
 {

diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index b163216d874..cc875bf0dcd 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -28,6 +28,7 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) {
 }
 } // namespace
+const uint64_t kMicrosInSecond = 1000 * 1000;
 const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
 "UnknownColumnFamily";

diff --git
a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e2ad933b9b8..e21111727c9 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -16,6 +16,8 @@ namespace rocksdb { +extern const uint64_t kMicrosInSecond; + // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 37166d8a9c4..b391d5dc8a5 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -10,8 +10,6 @@ namespace rocksdb { -const uint64_t kMicrosInSecond = 1000000; - // A cache configuration provided by user. struct CacheConfiguration { std::string cache_name; // LRU. From cfdf2116d38cd39763528ce2f3a01e661700c601 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 1 Jul 2019 16:32:59 -0700 Subject: [PATCH 196/572] Exclude StatsHistoryTest.ForceManualFlushStatsCF test from lite mode (#5529) Summary: Recent commit 3886dddc3b44bf5061c0f93eab578c51e8bad7bd introduced a new test which is not compatible with lite mode and breaks contrun test: ``` [ RUN ] StatsHistoryTest.ForceManualFlushStatsCF monitoring/stats_history_test.cc:642: Failure Expected: (cfd_stats->GetLogNumber()) < (cfd_test->GetLogNumber()), actual: 15 vs 15 ``` This PR excludes the test from lite mode to appease the failing test Pull Request resolved: https://github.com/facebook/rocksdb/pull/5529 Differential Revision: D16080892 Pulled By: miasantreble fbshipit-source-id: 2f8a22758f71250cd9f204046404226ddc13b028 --- monitoring/stats_history_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index bef928558d7..9adacdbf7bc 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -567,7 +567,6 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); } -#endif // !ROCKSDB_LITE TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { Options options; @@ -644,6 +643,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { Close(); } +#endif // !ROCKSDB_LITE } // namespace rocksdb int main(int argc, char** argv) { From 662ce6204406f4377044e9fd34fb8dc502ca4df7 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 2 Jul 2019 11:45:32 -0700 Subject: [PATCH 197/572] Reduce iterator key comparison for upper/lower bound check (2nd attempt) (#5468) Summary: This is a second attempt for https://github.com/facebook/rocksdb/issues/5111, with the fix to redo iterate bounds check after `SeekXXX()`. This is because MyRocks may change iterate bounds between seek. See https://github.com/facebook/rocksdb/issues/5111 for original benchmark result and discussion. Closes https://github.com/facebook/rocksdb/issues/5463. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5468 Test Plan: Existing rocksdb tests, plus myrocks test `rocksdb.optimizer_loose_index_scans` and `rocksdb.group_min_max`. 
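The core idea, condensed from the db_iter.cc hunk below (a sketch, not a drop-in excerpt): the per-key comparison against the iterate bound is only performed when the child iterator reports that the current entry may be out of bound; keys from data blocks known to be fully in-bound skip the comparison entirely.

```
// Condensed from the diff below: skip the (potentially expensive) user-key
// comparison against iterate_upper_bound_ whenever the current block is
// known to lie entirely below the bound.
if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
    user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
  break;  // reached the upper bound; stop scanning forward
}
```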
Differential Revision: D15863332

fbshipit-source-id: ab4aba5899838591806b8673899bd465f3f53e18
---
 HISTORY.md | 1 +
 db/db_iter.cc | 9 ++-
 db/db_iterator_test.cc | 62 +++++++++++++++++++
 db/version_set.cc | 48 +++++++++++---
 table/block_based/block_based_table_reader.cc | 30 ++++++---
 table/block_based/block_based_table_reader.h | 15 ++++-
 table/internal_iterator.h | 25 +++++++-
 table/iterator_wrapper.h | 22 +++++--
 table/merging_iterator.cc | 24 +++++++
 9 files changed, 212 insertions(+), 24 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2c8dc8c3ab9..c3af6ba06d7 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -24,6 +24,7 @@
 * Reduce binary search when iterator reseek into the same data block.
 * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
 * Merging iterator to avoid child iterator reseek for some cases
+* Reduce iterator key comparison for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel

diff --git a/db/db_iter.cc b/db/db_iter.cc
index b89d7301131..633724c5763 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -467,7 +467,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check)
 is_key_seqnum_zero_ = (ikey_.sequence == 0);
- if (iterate_upper_bound_ != nullptr &&
+ assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
+ if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
 user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
 break;
 }
@@ -859,7 +861,10 @@ void DBIter::PrevInternal() {
 return;
 }
- if (iterate_lower_bound_ != nullptr &&
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
 user_comparator_.Compare(saved_key_.GetUserKey(),
 *iterate_lower_bound_) < 0) {
 // We've iterated earlier than the user-specified lower bound.

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index d514e7683de..67a97b20b81 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -2759,6 +2759,68 @@ TEST_P(DBIteratorTest, AvoidReseekChildIterator) {
 SyncPoint::GetInstance()->DisableProcessing();
 }
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+ // Seek and iterate across block boundary.
+ iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + ub = Slice(ub2); + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + std::string lb1 = "a"; + std::string lb2 = "c"; + Slice lb(lb1); + ReadOptions read_opts2; + read_opts2.iterate_lower_bound = &lb; + iter = NewIterator(read_opts2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + lb = Slice(lb2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 8e2d21b051a..3354959aac4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -955,6 +970,7 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + CheckMayBeOutOfLowerBound(); return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, @@ -963,6 +979,19 @@ class LevelIterator final : public InternalIterator { largest_compaction_key); } + // Check if current file being fully within iterate_lower_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. 
+ void CheckMayBeOutOfLowerBound() { + if (Valid() && read_options_.iterate_lower_bound != nullptr) { + may_be_out_of_lower_bound_ = + user_comparator_.Compare( + ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; + } + } + TableCache* table_cache_; const ReadOptions read_options_; const EnvOptions& env_options_; @@ -976,6 +1005,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; TableReaderCaller caller_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1011,6 +1041,7 @@ void LevelIterator::Seek(const Slice& target) { file_iter_.Seek(target); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekForPrev(const Slice& target) { @@ -1024,6 +1055,7 @@ void LevelIterator::SeekForPrev(const Slice& target) { file_iter_.SeekForPrev(target); SkipEmptyFileBackward(); } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToFirst() { @@ -1032,6 +1064,7 @@ void LevelIterator::SeekToFirst() { file_iter_.SeekToFirst(); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToLast() { @@ -1040,15 +1073,17 @@ void LevelIterator::SeekToLast() { file_iter_.SeekToLast(); } SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4366,10 +4401,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index edddecf78bd..87756f2e240 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2896,6 +2896,7 @@ void BlockBasedTableIterator::SeekImpl( FindKeyForward(); } + CheckDataBlockWithinUpperBound(); CheckOutOfBound(); if (target) { @@ -2952,6 +2953,7 @@ void BlockBasedTableIterator::SeekForPrev( block_iter_.SeekForPrev(target); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); assert(!block_iter_.Valid() || icomp_.Compare(target, block_iter_.key()) >= 0); } @@ -2969,6 +2971,7 @@ void BlockBasedTableIterator::SeekToLast() { InitDataBlock(); block_iter_.SeekToLast(); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); } template @@ -2984,11 +2987,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -3087,6 +3091,7 @@ void 
BlockBasedTableIterator::InitDataBlock() { /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); } } @@ -3140,13 +3145,12 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, index_iter_->user_key()) <= 0); - } ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { @@ -3210,6 +3214,16 @@ void BlockBasedTableIterator::CheckOutOfBound() { } } +template +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 358bc8b8d22..750700813d3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -652,7 +652,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && @@ -702,6 +702,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -768,6 +773,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool block_iter_points_to_real_block_; // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; // True if we're standing at the first key of a block, and we haven't loaded // that block yet. A call to value() will trigger loading the block. bool is_at_first_key_from_index_ = false; @@ -802,6 +809,12 @@ class BlockBasedTableIterator : public InternalIteratorBase { void FindBlockForward(); void FindKeyBackward(); void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. 
+ void CheckDataBlockWithinUpperBound(); }; } // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 696e66135dc..426ff396548 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -97,6 +111,13 @@ class InternalIteratorBase : public Cleanable { // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. + virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1e2..a5aa5c49eac 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,7 +68,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; 
bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 207066b5a1e..1a0d4df8995 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator: as long as the current child iterator + // cannot report an out-of-bound key, we know the current key is within + // bounds. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From 0d57d93a06727943dbad0bc80768a29d74ce22a0 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 2 Jul 2019 12:03:40 -0700 Subject: [PATCH 198/572] Support jemalloc compiled with `--with-jemalloc-prefix` (#5521) Summary: Previously, if jemalloc was built with a nonempty string for `--with-jemalloc-prefix`, then `HasJemalloc()` would return false on Linux, so jemalloc would not be used at runtime. On Mac, it would cause a linker failure due to no definitions found for the weak functions declared in "port/jemalloc_helper.h". This should be a rare problem because (1) on Linux the default `--with-jemalloc-prefix` value is the empty string, and (2) Homebrew's build explicitly sets `--with-jemalloc-prefix` to the empty string. However, there are cases where `--with-jemalloc-prefix` is nonempty. For example, when building jemalloc from source on Mac, the default setting is `--with-jemalloc-prefix=je_`. Such jemalloc builds should be usable by RocksDB. The fix is simple. Defining `JEMALLOC_MANGLE` before including "jemalloc.h" causes it to define unprefixed symbols that are aliases for each of the prefixed symbols. Thanks to benesch for figuring this out and explaining it to me. Fixes https://github.com/facebook/rocksdb/issues/1462. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5521 Test Plan: build jemalloc with prefixed symbols: ``` $ ./configure --with-jemalloc-prefix=lol $ make ``` compile rocksdb against it: ``` $ WITH_JEMALLOC_FLAG=1 JEMALLOC=1 EXTRA_LDFLAGS="-L/home/andrew/jemalloc/lib/" EXTRA_CXXFLAGS="-I/home/andrew/jemalloc/include/" make -j12 ./db_bench ``` run db_bench and verify jemalloc actually used: ``` $ ./db_bench -benchmarks=fillrandom -statistics=true -dump_malloc_stats=true -stats_dump_period_sec=1 $ grep jemalloc /tmp/rocksdbtest-1000/dbbench/LOG 2019/06/29-12:20:52.088658 7fc5fb7f6700 [_impl/db_impl.cc:837] ___ Begin jemalloc statistics ___ ...
``` Differential Revision: D16092758 fbshipit-source-id: c2c358346190ed62ceb2a3547a6c4c180b12f7c4 --- db/malloc_stats.cc | 4 ---- port/jemalloc_helper.h | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index bcee5c3fbfe..1dfe0d55b43 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -20,10 +20,6 @@ namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#ifdef JEMALLOC_NO_RENAME -#define malloc_stats_print je_malloc_stats_print -#endif - typedef struct { char* cur; char* end; diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 26e5fb66336..6aeb780ee6e 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -9,6 +9,7 @@ #ifdef __FreeBSD__ #include #else +#define JEMALLOC_MANGLE #include #endif From 09ea5d8944700be9ce00fdd66f29f34573f33e76 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 2 Jul 2019 12:58:50 -0700 Subject: [PATCH 199/572] Fix clang build with jemalloc (#5522) Summary: Fixes the below build failure for clang compiler using glibc and jemalloc. Platform: linux x86-64 Compiler: clang version 6.0.0-1ubuntu2 Build failure: ``` $ CXX=clang++ CC=clang USE_CLANG=1 WITH_JEMALLOC_FLAG=1 JEMALLOC=1 EXTRA_LDFLAGS="-L/home/andrew/jemalloc/lib/" EXTRA_CXXFLAGS="-I/home/andrew/jemalloc/include/" make check -j12 ... CC memory/jemalloc_nodump_allocator.o In file included from memory/jemalloc_nodump_allocator.cc:6: In file included from ./memory/jemalloc_nodump_allocator.h:11: In file included from ./port/jemalloc_helper.h:16: /usr/include/clang/6.0.0/include/mm_malloc.h:39:16: error: 'posix_memalign' is missing exception specification 'throw()' extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size); ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:388:26: note: expanded from macro 'posix_memalign' # define posix_memalign je_posix_memalign ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:77:29: note: expanded from macro 'je_posix_memalign' # define je_posix_memalign posix_memalign ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:232:38: note: previous declaration is here JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:77:29: note: expanded from macro 'je_posix_memalign' # define je_posix_memalign posix_memalign ^ 1 error generated. Makefile:1972: recipe for target 'memory/jemalloc_nodump_allocator.o' failed make: *** [memory/jemalloc_nodump_allocator.o] Error 1 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5522 Differential Revision: D16069869 Pulled By: miasantreble fbshipit-source-id: c489bbc993adee194b9a550134c6237a264bc443 --- port/jemalloc_helper.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 6aeb780ee6e..a9095ec98dc 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -5,6 +5,19 @@ #pragma once +#if defined(__clang__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. 
+// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include +#endif + #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include From 84c5c9aab15896e1c55c3febfa1fac5ed2009069 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Wed, 3 Jul 2019 18:36:08 -0700 Subject: [PATCH 200/572] Fix a bug in compaction reads causing checksum mismatches and asan errors (#5531) Summary: Fixed a bug in compaction reads due to which incorrect number of bytes were being read/utilized. The bug was introduced in https://github.com/facebook/rocksdb/issues/5498 , resulting in "Corruption: block checksum mismatch" and "heap-buffer-overflow" asan errors in our tests. https://github.com/facebook/rocksdb/issues/5498 was introduced recently and is not in any released versions. ASAN: ``` > ==2280939==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6250005e83da at pc 0x000000d57f62 bp 0x7f954f483770 sp 0x7f954f482f20 > === How to use this, how to get the raw stack trace, and more: fburl.com/ASAN === > READ of size 4 at 0x6250005e83da thread T4 > SCARINESS: 27 (4-byte-read-heap-buffer-overflow-far-from-bounds) > #0 tests+0xd57f61 __asan_memcpy > https://github.com/facebook/rocksdb/issues/1 rocksdb/src/util/coding.h:124 rocksdb::DecodeFixed32(char const*) > https://github.com/facebook/rocksdb/issues/2 rocksdb/src/table/block_fetcher.cc:39 rocksdb::BlockFetcher::CheckBlockChecksum() > https://github.com/facebook/rocksdb/issues/3 rocksdb/src/table/block_fetcher.cc:99 rocksdb::BlockFetcher::TryGetFromPrefetchBuffer() > https://github.com/facebook/rocksdb/issues/4 rocksdb/src/table/block_fetcher.cc:209 rocksdb::BlockFetcher::ReadBlockContents() > https://github.com/facebook/rocksdb/issues/5 rocksdb/src/table/block_based/block_based_table_reader.cc:93 rocksdb::(anonymous namespace)::ReadBlockFromFile(rocksdb::RandomAccessFileReader*, rocksdb::FilePrefetchBuffer*, rocksdb::Footer const&, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, std::unique_ptr<...>*, rocksdb::ImmutableCFOptions const&, bool, bool, rocksdb::UncompressionDict const&, rocksdb::PersistentCacheOptions const&, unsigned long, unsigned long, rocksdb::MemoryAllocator*, bool) > https://github.com/facebook/rocksdb/issues/6 rocksdb/src/table/block_based/block_based_table_reader.cc:2331 rocksdb::BlockBasedTable::RetrieveBlock(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<...>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool) const > https://github.com/facebook/rocksdb/issues/7 rocksdb/src/table/block_based/block_based_table_reader.cc:2090 rocksdb::DataBlockIter* rocksdb::BlockBasedTable::NewDataBlockIterator<...>(rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::DataBlockIter*, rocksdb::BlockType, bool, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::Status, rocksdb::FilePrefetchBuffe r*, bool) const > https://github.com/facebook/rocksdb/issues/8 rocksdb/src/table/block_based/block_based_table_reader.cc:2720 rocksdb::BlockBasedTableIterator<...>::InitDataBlock() > https://github.com/facebook/rocksdb/issues/9 rocksdb/src/table/block_based/block_based_table_reader.cc:2607 rocksdb::BlockBasedTableIterator<...>::SeekToFirst() > https://github.com/facebook/rocksdb/issues/10 
rocksdb/src/table/iterator_wrapper.h:83 rocksdb::IteratorWrapperBase<...>::SeekToFirst() > https://github.com/facebook/rocksdb/issues/11 rocksdb/src/table/merging_iterator.cc:100 rocksdb::MergingIterator::SeekToFirst() > https://github.com/facebook/rocksdb/issues/12 rocksdb/compaction/compaction_job.cc:877 rocksdb::CompactionJob::ProcessKeyValueCompaction(rocksdb::CompactionJob::SubcompactionState*) > https://github.com/facebook/rocksdb/issues/13 rocksdb/compaction/compaction_job.cc:590 rocksdb::CompactionJob::Run() > https://github.com/facebook/rocksdb/issues/14 rocksdb/db_impl/db_impl_compaction_flush.cc:2689 rocksdb::DBImpl::BackgroundCompaction(bool*, rocksdb::JobContext*, rocksdb::LogBuffer*, rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority) > https://github.com/facebook/rocksdb/issues/15 rocksdb/db_impl/db_impl_compaction_flush.cc:2248 rocksdb::DBImpl::BackgroundCallCompaction(rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority) > https://github.com/facebook/rocksdb/issues/16 rocksdb/db_impl/db_impl_compaction_flush.cc:2024 rocksdb::DBImpl::BGWorkCompaction(void*) > https://github.com/facebook/rocksdb/issues/23 rocksdb/src/util/threadpool_imp.cc:266 rocksdb::ThreadPoolImpl::Impl::BGThread(unsigned long) > https://github.com/facebook/rocksdb/issues/24 rocksdb/src/util/threadpool_imp.cc:307 rocksdb::ThreadPoolImpl::Impl::BGThreadWrapper(void*) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5531 Test Plan: Verified that this fixes the fb-internal Logdevice test which caught the issue. Differential Revision: D16109702 Pulled By: sagar0 fbshipit-source-id: 1fc08549cf7b553e338a133ae11eb9f4d5011914 --- util/file_reader_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index f49866d13e7..db16e82ae11 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -842,7 +842,7 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, assert(max_readahead_size_ >= readahead_size_); Status s; if (for_compaction) { - s = Prefetch(file_reader_, offset, readahead_size_, for_compaction); + s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), for_compaction); } else { s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); } From 6edc5d0719d9739e06e860a065f1f873844b836c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 3 Jul 2019 18:45:36 -0700 Subject: [PATCH 201/572] Block cache tracing: Associate a unique id with Get and MultiGet (#5514) Summary: This PR associates a unique id with Get and MultiGet. This enables us to track how many blocks a Get/MultiGet request accesses. We can also measure the impact of row cache vs block cache. 
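To make the id assignment concrete before the diff: it boils down to a process-wide atomic counter that never hands out the reserved value 0, which is used when tracing is disabled or the caller is not a Get/MultiGet. A minimal standalone sketch, condensed from the NextGetId() logic in this diff; the class wrapper is illustrative and the tracing-enabled check is omitted:

```
#include <atomic>
#include <cstdint>

// Reserved id for callers that are not a tracked Get/MultiGet.
constexpr uint64_t kReservedGetId = 0;

class GetIdCounter {
 public:
  uint64_t Next() {
    uint64_t prev = counter_.fetch_add(1);
    if (prev == kReservedGetId) {
      // The counter wrapped around to the reserved value; fetch-and-add
      // once more so the reserved id is never handed out.
      prev = counter_.fetch_add(1);
    }
    return prev;
  }

 private:
  std::atomic<uint64_t> counter_{1};  // Start past the reserved id.
};
```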
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5514 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16032681 Pulled By: HaoyuHuang fbshipit-source-id: 775b05f4440badd58de6667e3ec9f4fc87a0af4c --- db/version_set.cc | 16 ++++- table/block_based/block_based_table_reader.cc | 63 +++++++++++++------ table/get_context.cc | 19 +++--- table/get_context.h | 8 ++- table/table_test.cc | 46 +++++++------- trace_replay/block_cache_tracer.cc | 29 ++++++++- trace_replay/block_cache_tracer.h | 38 +++++++---- trace_replay/block_cache_tracer_test.cc | 38 +++++++++++ 8 files changed, 191 insertions(+), 66 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 3354959aac4..226ba0e7e59 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1663,11 +1663,17 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } PinnedIteratorsManager pinned_iters_mgr; + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, + tracing_get_id); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1785,7 +1791,12 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (merge_operator_) { pinned_iters_mgr.StartPinning(); } + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_mget_id = vset_->block_cache_tracer_->NextGetId(); + } // Even though we know the batch size won't be > MAX_BATCH_SIZE, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive @@ -1797,7 +1808,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, iter->value, nullptr, &(iter->merge_context), &iter->max_covering_tombstone_seq, this->env_, &iter->seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_mget_id); } int get_ctx_index = 0; for (auto iter = range->begin(); iter != range->end(); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 87756f2e240..65bc6dfbc11 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1983,7 +1983,7 @@ CachableEntry BlockBasedTable::GetFilter( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io); + /*no_insert=*/no_io, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -2065,7 +2065,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io); + /*no_insert=*/no_io, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, cache_key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -2426,7 +2426,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - no_insert); + no_insert, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -3340,7 +3340,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet}; + uint64_t tracing_get_id = get_context ? get_context->tracing_get_id() + : BlockCacheTraceHelper::kReservedGetId; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, + tracing_get_id}; { if (!skip_filters) { filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, @@ -3406,7 +3409,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet}; + TableReaderCaller::kUserGet, tracing_get_id}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3447,8 +3450,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (!get_context->SaveValue( parsed_key, biter.value(), &matched, biter.IsValuePinned() ? &biter : nullptr)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } done = true; break; } @@ -3459,6 +3464,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. 
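+      // Trace the exact internal key found in the block when the lookup
+      // succeeded; otherwise fall back to the user key portion of the
+      // search key.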
+ Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = ExtractUserKey(key); + } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, @@ -3467,12 +3478,13 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, rep_->sst_number_for_tracing(), lookup_data_block_context.caller, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); block_cache_tracer_->WriteBlockAccess( access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); + rep_->cf_name_for_tracing(), referenced_key); } if (done) { @@ -3498,14 +3510,19 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), mget_range->end()); - { - if (!skip_filters) { + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->tracing_get_id(); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, + tracing_mget_id}; + if (!skip_filters) { + { // TODO: Figure out where the stats should go filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, read_options.read_tier == kBlockCacheTier, @@ -3644,7 +3661,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet); + TableReaderCaller::kUserMultiGet, tracing_mget_id); if (first_block) { if (!block_handles[idx_in_batch].IsNull() || !results[idx_in_batch].IsEmpty()) { @@ -3703,7 +3720,6 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (!ParseInternalKey(biter->key(), &parsed_key)) { s = Status::Corruption(Slice()); } @@ -3719,11 +3735,13 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, value_pinner = biter; } } - - if (!get_context->SaveValue( - parsed_key, biter->value(), &matched, value_pinner)) { - does_referenced_key_exist = true; - referenced_data_size = biter->key().size() + biter->value().size(); + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } done = true; break; } @@ -3733,6 +3751,12 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. 
+ Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = ExtractUserKey(key); + } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, @@ -3741,12 +3765,13 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, rep_->sst_number_for_tracing(), lookup_data_block_context.caller, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); block_cache_tracer_->WriteBlockAccess( access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); + rep_->cf_name_for_tracing(), referenced_key); } s = biter->status(); if (done) { diff --git a/table/get_context.cc b/table/get_context.cc index 9be16b0627d..f0c7928bf42 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -38,15 +38,13 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, - const Slice& user_key, PinnableSlice* pinnable_val, - bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index) +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -62,7 +60,8 @@ GetContext::GetContext(const Comparator* ucmp, replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), - is_blob_index_(is_blob_index) { + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id) { if (seq_) { *seq_ = kMaxSequenceNumber; } diff --git a/table/get_context.h b/table/get_context.h index ddce33fb3be..f567229cc9f 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -85,7 +85,8 @@ class GetContext { SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, - ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0); GetContext() = default; @@ -135,6 +136,8 @@ class GetContext { void ReportCounters(); + uint64_t tracing_get_id() const { return tracing_get_id_; } + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -158,6 +161,9 @@ class GetContext { ReadCallback* callback_; bool sample_; bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; }; // Call this to replay a log and bring the get_context up to date. 
The replay diff --git a/table/table_test.cc b/table/table_test.cc index 418ecf004b7..c3a1f82ed37 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2563,23 +2563,25 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto reader = c.GetTableReader(); PinnableSlice value; - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - if (index_and_filter_in_cache) { - // data, index and filter block - ASSERT_EQ(get_perf_context()->block_read_count, 3); - ASSERT_EQ(get_perf_context()->index_block_read_count, 1); - ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); - } else { - // just the data block - ASSERT_EQ(get_perf_context()->block_read_count, 1); + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + if (index_and_filter_in_cache) { + // data, index and filter block + ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // just the data block + ASSERT_EQ(get_perf_context()->block_read_count, 1); + } + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); } - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); // Get non-existing key user_key = "does-not-exist"; @@ -2587,13 +2589,15 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { encoded_key = internal_key.Encode().ToString(); value.Reset(); - get_context = GetContext(options.comparator, nullptr, nullptr, nullptr, + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - ASSERT_EQ(get_context.State(), GetContext::kNotFound); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + } if (index_and_filter_in_cache) { if (bloom_filter_type == 0) { diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index cc875bf0dcd..115a75d924b 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -31,6 +31,7 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { const uint64_t kMicrosInSecond = 1000 * 1000; const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; +const uint64_t BlockCacheTraceHelper::kReservedGetId = 0; bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, TableReaderCaller caller) { @@ -39,6 +40,11 @@ bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, caller == TableReaderCaller::kUserMultiGet); } +bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == 
TableReaderCaller::kUserMultiGet; +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) @@ -65,6 +71,9 @@ Status BlockCacheTraceWriter::WriteBlockAccess( trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); + if (BlockCacheTraceHelper::ShouldTraceGetId(record.caller)) { + PutFixed64(&trace.payload, record.get_id); + } if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, record.caller)) { PutLengthPrefixedSlice(&trace.payload, referenced_key); @@ -197,7 +206,12 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { } record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - + if (BlockCacheTraceHelper::ShouldTraceGetId(record->caller)) { + if (!GetFixed64(&enc_slice, &record->get_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the get id."); + } + } if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, record->caller)) { Slice referenced_key; @@ -236,6 +250,7 @@ Status BlockCacheTracer::StartTrace( if (writer_.load()) { return Status::Busy(); } + get_id_counter_.store(1); trace_options_ = trace_options; writer_.store( new BlockCacheTraceWriter(env, trace_options, std::move(trace_writer))); @@ -266,4 +281,16 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, referenced_key); } +uint64_t BlockCacheTracer::NextGetId() { + if (!writer_.load(std::memory_order_relaxed)) { + return BlockCacheTraceHelper::kReservedGetId; + } + uint64_t prev_value = get_id_counter_.fetch_add(1); + if (prev_value == BlockCacheTraceHelper::kReservedGetId) { + // fetch and add again. + return get_id_counter_.fetch_add(1); + } + return prev_value; +} + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e21111727c9..4788a3f447f 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -18,6 +18,16 @@ namespace rocksdb { extern const uint64_t kMicrosInSecond; +class BlockCacheTraceHelper { + public: + static bool ShouldTraceReferencedKey(TraceType block_type, + TableReaderCaller caller); + static bool ShouldTraceGetId(TableReaderCaller caller); + + static const std::string kUnknownColumnFamilyName; + static const uint64_t kReservedGetId; +}; + // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter @@ -38,8 +48,10 @@ extern const uint64_t kMicrosInSecond; // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or // kUserApproximateSize). struct BlockCacheLookupContext { -BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} -const TableReaderCaller caller; + BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} + BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id) + : caller(_caller), get_id(_get_id) {} + const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses these inforation when logging the block access at // BlockBasedTable::GET and BlockBasedTable::MultiGet. @@ -49,6 +61,10 @@ const TableReaderCaller caller; uint64_t block_size = 0; std::string block_key; uint64_t num_keys_in_block = 0; + // The unique id associated with Get and MultiGet. 
This enables us to track + // how many blocks a Get/MultiGet request accesses. We can also measure the + // impact of row cache vs block cache. + uint64_t get_id = 0; void FillLookupContext(bool _is_cache_hit, bool _no_insert, TraceType _block_type, uint64_t _block_size, @@ -78,7 +94,8 @@ struct BlockCacheTraceRecord { TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; Boolean is_cache_hit = Boolean::kFalse; Boolean no_insert = Boolean::kFalse; - + // Required field for Get and MultiGet + uint64_t get_id = BlockCacheTraceHelper::kReservedGetId; // Required fields for data block and user Get/Multi-Get only. std::string referenced_key; uint64_t referenced_data_size = 0; @@ -91,7 +108,7 @@ struct BlockCacheTraceRecord { TraceType _block_type, uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller, - bool _is_cache_hit, bool _no_insert, + bool _is_cache_hit, bool _no_insert, uint64_t _get_id, std::string _referenced_key = "", uint64_t _referenced_data_size = 0, uint64_t _num_keys_in_block = 0, @@ -107,6 +124,7 @@ struct BlockCacheTraceRecord { caller(_caller), is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), + get_id(_get_id), referenced_key(_referenced_key), referenced_data_size(_referenced_data_size), num_keys_in_block(_num_keys_in_block), @@ -121,14 +139,6 @@ struct BlockCacheTraceHeader { uint32_t rocksdb_minor_version; }; -class BlockCacheTraceHelper { - public: - static bool ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller); - - static const std::string kUnknownColumnFamilyName; -}; - // BlockCacheTraceWriter captures all RocksDB block cache accesses using a // user-provided TraceWriter. Every RocksDB operation is written as a single // trace. Each trace will have a timestamp and type, followed by the trace @@ -207,11 +217,15 @@ class BlockCacheTracer { const Slice& block_key, const Slice& cf_name, const Slice& referenced_key); + // GetId cycles from 1 to port::kMaxUint64. + uint64_t NextGetId(); + private: TraceOptions trace_options_; // A mutex protects the writer_. InstrumentedMutex trace_writer_mutex_; std::atomic writer_; + std::atomic get_id_counter_; }; } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index e7a5881044f..aae513ad5d7 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -71,6 +71,9 @@ class BlockCacheTracerTest : public testing::Test { record.sst_fd_number = kSSTFDNumber + key_id; record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; + // Provide get_id for all callers. The writer should only write get_id + // when the caller is either GET or MGET. + record.get_id = key_id + 1; // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
@@ -120,6 +123,12 @@ class BlockCacheTracerTest : public testing::Test { ASSERT_EQ(kSSTFDNumber + key_id, record.sst_fd_number); ASSERT_EQ(Boolean::kFalse, record.is_cache_hit); ASSERT_EQ(Boolean::kFalse, record.no_insert); + if (record.caller == TableReaderCaller::kUserGet || + record.caller == TableReaderCaller::kUserMultiGet) { + ASSERT_EQ(key_id + 1, record.get_id); + } else { + ASSERT_EQ(BlockCacheTraceHelper::kReservedGetId, record.get_id); + } if (block_type == TraceType::kBlockTraceDataBlock && (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet)) { @@ -239,6 +248,35 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { } } +TEST_F(BlockCacheTracerTest, NextGetId) { + BlockCacheTracer writer; + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + // next get id should always return 0 before we call StartTrace. + ASSERT_EQ(0, writer.NextGetId()); + ASSERT_EQ(0, writer.NextGetId()); + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_EQ(1, writer.NextGetId()); + ASSERT_EQ(2, writer.NextGetId()); + writer.EndTrace(); + // next get id should return 0. + ASSERT_EQ(0, writer.NextGetId()); + } + + // Start trace again and next get id should return 1. + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_EQ(1, writer.NextGetId()); + } +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. From e4dcf5fd22509ae6741733a0f02feb7b68421f55 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 3 Jul 2019 19:03:29 -0700 Subject: [PATCH 202/572] db_bench to add a new "benchmark" to print out all stats history (#5532) Summary: Sometimes it is helpful to fetch the whole history of stats after benchmark runs. Add such an option Pull Request resolved: https://github.com/facebook/rocksdb/pull/5532 Test Plan: Run the benchmark manually and observe the output is as expected. Differential Revision: D16097764 fbshipit-source-id: 10b5b735a22a18be198b8f348be11f11f8806904 --- HISTORY.md | 1 + tools/db_bench_tool.cc | 36 ++++++++++++++++++++++++++++++++++++ tools/ldb_cmd.cc | 32 +++++++++++--------------------- util/string_util.cc | 11 +++++++++++ util/string_util.h | 4 ++++ 5 files changed, 63 insertions(+), 21 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c3af6ba06d7..6e0fcc54efb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. +* db_bench adds a "benchmark" stats_history, which prints out the whole stats history. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. 
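For reference, one way to drive the new pseudo-benchmark from the command line; `stats_history` as a `-benchmarks` entry comes from this patch, while the stats-persistence flag spelling is assumed from contemporary db_bench options and may differ by version:

```
$ ./db_bench -benchmarks=fillrandom,stats_history \
    -stats_persist_period_sec=1 -duration=10
```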
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index abffae5d9e8..39f9eebc7e0 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -49,6 +49,7 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/stats_history.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/options_util.h" @@ -2867,6 +2868,8 @@ class Benchmark { PrintStats("rocksdb.levelstats"); } else if (name == "sstables") { PrintStats("rocksdb.sstables"); + } else if (name == "stats_history") { + PrintStatsHistory(); } else if (name == "replay") { if (num_threads > 1) { fprintf(stderr, "Multi-threaded replay is not yet supported\n"); @@ -6259,6 +6262,39 @@ class Benchmark { } } + void PrintStatsHistory() { + if (db_.db != nullptr) { + PrintStatsHistoryImpl(db_.db, false); + } + for (const auto& db_with_cfh : multi_dbs_) { + PrintStatsHistoryImpl(db_with_cfh.db, true); + } + } + + void PrintStatsHistoryImpl(DB* db, bool print_header) { + if (print_header) { + fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); + } + + std::unique_ptr shi; + Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi); + if (!s.ok()) { + fprintf(stdout, "%s\n", s.ToString().c_str()); + return; + } + assert(shi); + while (shi->Valid()) { + uint64_t stats_time = shi->GetStatsTime(); + fprintf(stdout, "------ %s ------\n", + TimeToHumanString(static_cast(stats_time)).c_str()); + for (auto& entry : shi->GetStatsMap()) { + fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time, + entry.first.c_str(), entry.second); + } + shi->Next(); + } + } + void PrintStats(const char* key) { if (db_.db != nullptr) { PrintStats(db_.db, key, false); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index a1507b188b2..fba32d9d622 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -859,8 +859,7 @@ void CompactorCommand::DoCommand() { delete end; } -// ---------------------------------------------------------------------------- - +// --------------------------------------------------------------------------- const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; const std::string DBLoaderCommand::ARG_COMPACT = "compact"; @@ -1168,19 +1167,8 @@ void DropColumnFamilyCommand::DoCommand() { } // ---------------------------------------------------------------------------- - namespace { -std::string ReadableTime(int unixtime) { - char time_buffer [80]; - time_t rawtime = unixtime; - struct tm tInfo; - struct tm* timeinfo = localtime_r(&rawtime, &tInfo); - assert(timeinfo == &tInfo); - strftime(time_buffer, 80, "%c", timeinfo); - return std::string(time_buffer); -} - // This function only called when it's the sane case of >1 buckets in time-range // Also called only when timekv falls between ttl_start and ttl_end provided void IncBucketCounts(std::vector& bucket_counts, int ttl_start, @@ -1202,13 +1190,13 @@ void PrintBucketCounts(const std::vector& bucket_counts, int time_point = ttl_start; for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { fprintf(stdout, "Keys in range %s to %s : %lu\n", - ReadableTime(time_point).c_str(), - ReadableTime(time_point + bucket_size).c_str(), + TimeToHumanString(time_point).c_str(), + TimeToHumanString(time_point + bucket_size).c_str(), (unsigned long)bucket_counts[i]); } fprintf(stdout, "Keys in range %s to %s : %lu\n", - ReadableTime(time_point).c_str(), - 
ReadableTime(ttl_end).c_str(), + TimeToHumanString(time_point).c_str(), + TimeToHumanString(ttl_end).c_str(), (unsigned long)bucket_counts[num_buckets - 1]); } @@ -1564,7 +1552,8 @@ void DBDumperCommand::DoDumpCommand() { std::vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", - ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + TimeToHumanString(ttl_start).c_str(), + TimeToHumanString(ttl_end).c_str()); } HistogramImpl vsize_hist; @@ -1619,7 +1608,7 @@ void DBDumperCommand::DoDumpCommand() { if (!count_only_ && !count_delim_) { if (is_db_ttl_ && timestamp_) { - fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); } std::string str = PrintKeyValue(iter->key().ToString(), iter->value().ToString(), @@ -2397,7 +2386,8 @@ void ScanCommand::DoCommand() { } if (is_db_ttl_ && timestamp_) { fprintf(stdout, "Scanning key-values from %s to %s\n", - ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + TimeToHumanString(ttl_start).c_str(), + TimeToHumanString(ttl_end).c_str()); } for ( ; it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); @@ -2409,7 +2399,7 @@ void ScanCommand::DoCommand() { continue; } if (timestamp_) { - fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); } } diff --git a/util/string_util.cc b/util/string_util.cc index 74f6afbf0f4..9b447d50ce3 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -18,6 +18,7 @@ #include #include "rocksdb/env.h" #include "port/port.h" +#include "port/sys_time.h" #include "rocksdb/slice.h" namespace rocksdb { @@ -139,6 +140,16 @@ std::string BytesToHumanString(uint64_t bytes) { return std::string(buf); } +std::string TimeToHumanString(int unixtime) { + char time_buffer[80]; + time_t rawtime = unixtime; + struct tm tInfo; + struct tm* timeinfo = localtime_r(&rawtime, &tInfo); + assert(timeinfo == &tInfo); + strftime(time_buffer, 80, "%c", timeinfo); + return std::string(time_buffer); +} + std::string EscapeString(const Slice& value) { std::string r; AppendEscapedStringTo(&r, value); diff --git a/util/string_util.h b/util/string_util.h index 6e125ddfa8f..faf763e54a1 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -50,6 +50,10 @@ extern std::string NumberToHumanString(int64_t num); // ex: 1048576 -> 1.00 GB extern std::string BytesToHumanString(uint64_t bytes); +// Return a human-readable version of unix time +// ex: 1562116015 -> "Tue Jul 2 18:06:55 2019" extern std::string TimeToHumanString(int unixtime); + // Append a human-readable time in micros. int AppendHumanMicros(uint64_t micros, char* output, int len, bool fixed_format); From 4f66ec977d9b8b83c0b7e16d25a43281cd6a8073 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Thu, 4 Jul 2019 17:24:33 -0700 Subject: [PATCH 203/572] Fix lower bound check error when iterate across file boundary (#5540) Summary: Since https://github.com/facebook/rocksdb/issues/5468, `LevelIterator` compares the lower bound against the file's smallest key in `NewFileIterator` and caches the result to reduce per-key lower bound checks. However, when iterating across a file boundary, it doesn't update the cached result: `Valid()` is false at that point because it still reflects the status of the previous file iterator. Fix it by removing the `Valid()` check from `CheckMayBeOutOfLowerBound()`.
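Condensed, the corrected check looks as follows. This is a fragment of `LevelIterator`, not a standalone function; the comparison tail, which is cut off in the hunk as rendered below, is reconstructed here:

```
void CheckMayBeOutOfLowerBound() {
  // Do not gate this on Valid(): when stepping across a file boundary,
  // Valid() still reflects the previous file's iterator, so the cached
  // flag would never be refreshed for the new file.
  if (read_options_.iterate_lower_bound != nullptr &&
      file_index_ < flevel_->num_files) {
    may_be_out_of_lower_bound_ =
        user_comparator_.Compare(
            ExtractUserKey(file_smallest_key(file_index_)),
            *read_options_.iterate_lower_bound) < 0;
  }
}
```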
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5540 Test Plan: See the new test. Signed-off-by: Yi Wu Differential Revision: D16127653 fbshipit-source-id: a0691e1164658d485c17971aaa97028812f74678 --- db/db_iterator_test.cc | 26 ++++++++++++++++++++++++++ db/version_set.cc | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 67a97b20b81..997b38602c4 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2821,6 +2821,32 @@ TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) { delete iter; } +TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { + ASSERT_OK(Put("aaa", "v")); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + // Move both files to bottom level. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Slice lower_bound("b"); + ReadOptions read_opts; + read_opts.iterate_lower_bound = &lower_bound; + std::unique_ptr iter(NewIterator(read_opts)); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 226ba0e7e59..32dd61db830 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -984,7 +984,8 @@ class LevelIterator final : public InternalIterator { // Note MyRocks may update iterate bounds between seek. To workaround it, // we need to check and update may_be_out_of_lower_bound_ accordingly. void CheckMayBeOutOfLowerBound() { - if (Valid() && read_options_.iterate_lower_bound != nullptr) { + if (read_options_.iterate_lower_bound != nullptr && + file_index_ < flevel_->num_files) { may_be_out_of_lower_bound_ = user_comparator_.Compare( ExtractUserKey(file_smallest_key(file_index_)), From 2de61d91293eb2ec2185d2bbe2b2eebc55db94cc Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 5 Jul 2019 12:28:48 -0700 Subject: [PATCH 204/572] Assert get_context not null in BlockBasedTable::Get() (#5542) Summary: clang analyze fails after https://github.com/facebook/rocksdb/pull/5514 for this failure: table/block_based/block_based_table_reader.cc:3450:16: warning: Called C++ object pointer is null if (!get_context->SaveValue( ^~~~~~~~~~~~~~~~~~~~~~~ 1 warning generated. The reason is that a branch on whether get_context is null was added earlier in the function, so clang analyze thinks it can be null when the call is made without a null check. Fix the issue by removing the branch and adding an assert. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5542 Test Plan: "make all check" passes and the clang analyze failure goes away.
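The pattern at work, reduced to a toy example (names here are hypothetical, not the RocksDB API): an assert both documents the non-null invariant and lets the analyzer discharge the warning, whereas a conditional read of the pointer teaches the analyzer that null is a possible value.

```
#include <cassert>

struct GetContext {
  void SaveValue() {}
};

void Lookup(GetContext* get_context) {
  // Invariant: callers always pass a non-null context. With the assert
  // (and no earlier null-check branch), clang analyze no longer assumes
  // get_context may be null at the call below.
  assert(get_context != nullptr);
  get_context->SaveValue();
}
```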
Differential Revision: D16133988 fbshipit-source-id: d4627d03c4746254cc11926c523931086ccebcda --- table/block_based/block_based_table_reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 65bc6dfbc11..baa5c397eb7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3335,13 +3335,13 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, const SliceTransform* prefix_extractor, bool skip_filters) { assert(key.size() >= 8); // key must be internal key + assert(get_context != nullptr); Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - uint64_t tracing_get_id = get_context ? get_context->tracing_get_id() - : BlockCacheTraceHelper::kReservedGetId; + uint64_t tracing_get_id = get_context->tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; { From e0d9d57750cb348f376ddd022276e8f493dd1e17 Mon Sep 17 00:00:00 2001 From: anand76 Date: Sat, 6 Jul 2019 21:04:22 -0700 Subject: [PATCH 205/572] Fix bugs in WAL trash file handling (#5520) Summary: 1. Cleanup WAL trash files on open 2. Don't apply deletion rate limit if WAL dir is different from db dir Pull Request resolved: https://github.com/facebook/rocksdb/pull/5520 Test Plan: Add new unit tests and make check Differential Revision: D16096750 Pulled By: anand1976 fbshipit-source-id: 6f07858ad864b754b711db416f0389c45ede599b --- HISTORY.md | 2 + db/db_impl/db_impl.cc | 14 ++-- db/db_impl/db_impl.h | 2 + db/db_impl/db_impl_files.cc | 3 +- db/db_impl/db_impl_open.cc | 23 +++++++ db/db_sst_test.cc | 105 ++++++++++++++++++++++++++++++ db/wal_manager.cc | 9 ++- db/wal_manager.h | 6 +- file/file_util.cc | 17 ++++- file/file_util.h | 6 +- utilities/blob_db/blob_db_impl.cc | 6 +- 11 files changed, 177 insertions(+), 16 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6e0fcc54efb..c425c578f87 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,6 +14,7 @@ * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. @@ -40,6 +41,7 @@ * Fix ingested file and directory not being fsync. * Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
+* On DB open, delete WAL trash files left behind in wal_dir ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 55f89eab32e..cf8dddb7fe1 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3137,6 +3137,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal @@ -3159,7 +3160,9 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname); + del = + DeleteDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { del = env->DeleteFile(path_to_delete); } @@ -3193,7 +3196,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(fname, &number, &type) && type == kTableFile) { // Lock file will be deleted at end std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname); + Status del = DeleteDBFile(&soptions, table_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); if (result.ok() && !del.ok()) { result = del; } @@ -3220,7 +3224,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = - DeleteDBFile(&soptions, archivedir + "/" + file, archivedir); + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3235,7 +3240,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), - soptions.wal_dir); + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 737f2337608..d417035b1ef 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1893,6 +1893,8 @@ class DBImpl : public DB { // results sequentially. Flush results of memtables with lower IDs get // installed to MANIFEST first. 
InstrumentedCondVar atomic_flush_install_cv_; + + bool wal_in_db_path_; }; extern Options SanitizeOptions(const std::string& db, const Options& src); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index c018432c9b8..7afe3955e5b 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -258,7 +258,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, Status file_deletion_status; if (type == kTableFile || type == kLogFile) { file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync); + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); } else { file_deletion_status = env_->DeleteFile(fname); } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 13d6959d474..82e61a260b8 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -122,6 +122,25 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } #ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", + filename.length() - std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } // When the DB is stopped, it's possible that there are some .trash files that // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager @@ -1294,6 +1313,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, delete impl; return s; } + + impl->wal_in_db_path_ = + IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 799d0e14f6b..37adee46722 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -470,6 +470,111 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +class DBWALTestWithParam + : public DBSSTTest, + public testing::WithParamInterface> { + public: + DBWALTestWithParam() { + wal_dir_ = std::get<0>(GetParam()); + wal_dir_same_as_dbname_ = std::get<1>(GetParam()); + } + + std::string wal_dir_; + bool wal_dir_same_as_dbname_; +}; + +TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { + class MyEnv : public EnvWrapper { + public: + MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} + + Status DeleteFile(const std::string& fname) { + if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { + return Status::OK(); + } + + return target()->DeleteFile(fname); + } + + void set_fake_log_delete(bool fake) { fake_log_delete = fake; } + + private: + bool fake_log_delete; + }; + + std::unique_ptr env(new MyEnv(Env::Default())); + Destroy(last_options_); + + env->set_fake_log_delete(true); + + Options options = CurrentOptions(); + 
options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env.get(); + options.wal_dir = dbname_ + wal_dir_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + + ASSERT_OK(TryReopen(options)); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + Close(); + + options.sst_file_manager.reset(); + std::vector filenames; + int trash_log_count = 0; + if (!wal_dir_same_as_dbname_) { + // Forcibly create some trash log files + std::unique_ptr result; + env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions()); + result.reset(); + } + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_GE(trash_log_count, 1); + + env->set_fake_log_delete(false); + ASSERT_OK(TryReopen(options)); + + filenames.clear(); + trash_log_count = 0; + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_EQ(trash_log_count, 0); + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, + ::testing::Values(std::make_tuple("", true), + std::make_tuple("_wal_dir", false))); + TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Options options = CurrentOptions(); diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 58671d599c5..0c996baf549 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -187,7 +187,8 @@ void WalManager::PurgeObsoleteWALFiles() { continue; } if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -213,7 +214,8 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), @@ -253,7 +255,8 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false); + db_options_.wal_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); diff --git a/db/wal_manager.h b/db/wal_manager.h index 
9d5afb25d5e..8d185c35076 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -18,6 +18,7 @@ #include #include "db/version_set.h" +#include "file/file_util.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -40,7 +41,8 @@ class WalManager { env_options_(env_options), env_(db_options.env), purge_wal_files_last_run_(0), - seq_per_batch_(seq_per_batch) {} + seq_per_batch_(seq_per_batch), + wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} Status GetSortedWalFiles(VectorLogPtr& files); @@ -97,6 +99,8 @@ class WalManager { bool seq_per_batch_; + bool wal_in_db_path_; + // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; diff --git a/file/file_util.cc b/file/file_util.cc index 0364f834022..050d25da1a7 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -88,12 +88,12 @@ Status CreateFile(Env* env, const std::string& destination, } Status DeleteDBFile(const ImmutableDBOptions* db_options, - const std::string& fname, const std::string& dir_to_sync, - const bool force_bg) { + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg, const bool force_fg) { #ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = static_cast(db_options->sst_file_manager.get()); - if (sfm) { + if (sfm && !force_fg) { return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); @@ -101,10 +101,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, #else (void)dir_to_sync; (void)force_bg; + (void)force_fg; // SstFileManager is not supported in ROCKSDB_LITE // Delete file immediately return db_options->env->DeleteFile(fname); #endif } +bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { + bool same = false; + Status s = db_options->env->AreFilesSame(db_options->wal_dir, + db_options->db_paths[0].path, &same); + if (s.IsNotSupported()) { + same = db_options->wal_dir == db_options->db_paths[0].path; + } + return same; +} + } // namespace rocksdb diff --git a/file/file_util.h b/file/file_util.h index 9116c1fecfb..75d6d7eb9fe 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -24,7 +24,9 @@ extern Status CreateFile(Env* env, const std::string& destination, extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, - const std::string& path_to_sync, - const bool force_bg = false); + const std::string& path_to_sync, const bool force_bg, + const bool force_fg); + +extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); } // namespace rocksdb diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 86eb1460c15..caa9b098804 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1758,7 +1758,8 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { blob_files_.erase(bfile->BlobFileNumber()); Status s = DeleteDBFile(&(db_impl_->immutable_db_options()), - bfile->PathName(), blob_dir_, true); + bfile->PathName(), blob_dir_, true, + /*force_fg=*/false); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File failed to be deleted as obsolete %s", @@ -1848,7 +1849,8 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == kBlobFile) { - Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true); + Status del = DeleteDBFile(&soptions, 
blobdir + "/" + f, blobdir, true, + /*force_fg=*/false); if (status.ok() && !del.ok()) { status = del; } From 8d34806972ad8867ede364feaa9d403e79b87d35 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Sun, 7 Jul 2019 21:29:39 -0700 Subject: [PATCH 206/572] setup wal_in_db_path_ for secondary instance (#5545) Summary: PR https://github.com/facebook/rocksdb/pull/5520 adds DBImpl:: wal_in_db_path_ and initializes it in DBImpl::Open, this PR fixes the valgrind error for secondary instance: ``` ==236417== Conditional jump or move depends on uninitialised value(s) ==236417== at 0x62242A: rocksdb::DeleteDBFile(rocksdb::ImmutableDBOptions const*, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, bool, bool) (file_util.cc:96) ==236417== by 0x512432: rocksdb::DBImpl::DeleteObsoleteFileImpl(int, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, rocksdb::FileType, unsigned long) (db_impl_files.cc:261) ==236417== by 0x515A7A: rocksdb::DBImpl::PurgeObsoleteFiles(rocksdb::JobContext&, bool) (db_impl_files.cc:492) ==236417== by 0x499153: rocksdb::ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() (column_family.cc:75) ==236417== by 0x499880: rocksdb::ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() (column_family.cc:84) ==236417== by 0x4C9AF9: rocksdb::DB::DestroyColumnFamilyHandle(rocksdb::ColumnFamilyHandle*) (db_impl.cc:3105) ==236417== by 0x44E853: CloseSecondary (db_secondary_test.cc:53) ==236417== by 0x44E853: rocksdb::DBSecondaryTest::~DBSecondaryTest() (db_secondary_test.cc:31) ==236417== by 0x44EC77: ~DBSecondaryTest_PrimaryDropColumnFamily_Test (db_secondary_test.cc:443) ==236417== by 0x44EC77: rocksdb::DBSecondaryTest_PrimaryDropColumnFamily_Test::~DBSecondaryTest_PrimaryDropColumnFamily_Test() (db_secondary_test.cc:443) ==236417== by 0x83D1D7: HandleSehExceptionsInMethodIfSupported (gtest-all.cc:3824) ==236417== by 0x83D1D7: void testing::internal::HandleExceptionsInMethodIfSupported(testing::Test*, void (testing::Test::*)(), char const*) (gtest-all.cc:3860) ==236417== by 0x8346DB: testing::TestInfo::Run() [clone .part.486] (gtest-all.cc:4078) ==236417== by 0x8348D4: Run (gtest-all.cc:4047) ==236417== by 0x8348D4: testing::TestCase::Run() [clone .part.487] (gtest-all.cc:4190) ==236417== by 0x834D14: Run (gtest-all.cc:6100) ==236417== by 0x834D14: testing::internal::UnitTestImpl::RunAllTests() (gtest-all.cc:6062) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5545 Differential Revision: D16146224 Pulled By: miasantreble fbshipit-source-id: 184c90e451352951da4e955f054d4b1a1f29ea29 --- db/db_impl/db_impl_secondary.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 8b93f675f8c..e14e53e55c3 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -587,6 +587,9 @@ Status DB::OpenAsSecondary( &impl->write_controller_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = + IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); s = impl->Recover(column_families, true, false, false); if (s.ok()) { From 7c76a7fba271ed9023d9d7ed714ae2b519087fdf Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sun, 7 Jul 2019 22:40:52 -0700 Subject: [PATCH 207/572] Support GetAllKeyVersions() for non-default cf (#5544) Summary: Previously `GetAllKeyVersions()` supports 
default column family only. This PR adds support for other column families. Test plan (devserver): ``` $make clean && COMPILE_WITH_ASAN=1 make -j32 db_basic_test $./db_basic_test --gtest_filter=DBBasicTest.GetAllKeyVersions ``` All other unit tests must pass. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5544 Differential Revision: D16147551 Pulled By: riversand963 fbshipit-source-id: 5a61aece2a32d789e150226a9b8d53f4a5760168 --- HISTORY.md | 1 + db/db_basic_test.cc | 50 +++++++++++++++++++++++++++++++ include/rocksdb/utilities/debug.h | 4 +++ utilities/debug.cc | 26 +++++++++++++++++++--- 4 files changed, 77 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c425c578f87..d7eb51160ee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,7 @@ * Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path +* Overload GetAllKeyVersions() to support non-default column family. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 66d3b3aff7c..dc77fb91a9b 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" #include "table/block_based/block_builder.h" #include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) @@ -1286,6 +1287,55 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, GetAllKeyVersions) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + const size_t kNumInserts = 4; + const size_t kNumDeletes = 4; + const size_t kNumUpdates = 4; + + // Check default column family + for (size_t i = 0; i != kNumInserts; ++i) { + ASSERT_OK(Put(std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates; ++i) { + ASSERT_OK(Put(std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes; ++i) { + ASSERT_OK(Delete(std::to_string(i))); + } + std::vector key_versions; + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + + // Check non-default column family + for (size_t i = 0; i != kNumInserts - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes - 1; ++i) { + ASSERT_OK(Delete(1, std::to_string(i))); + } + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); +} +#endif // !ROCKSDB_LITE + class
DBBasicTestWithParallelIO : public DBTestBase, public testing::WithParamInterface> { diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 50645423d0a..3fc414b6edf 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -40,6 +40,10 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions); +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions); + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/debug.cc b/utilities/debug.cc index 8ddf64b5dc4..3c35f4c1122 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -14,16 +14,34 @@ namespace rocksdb { Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions) { - assert(key_versions != nullptr); + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key, + max_num_ikeys, key_versions); +} + +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions) { + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + if (nullptr == cfh) { + return Status::InvalidArgument("Column family handle cannot be null."); + } + if (nullptr == key_versions) { + return Status::InvalidArgument("key_versions cannot be null."); + } key_versions->clear(); DBImpl* idb = static_cast(db->GetRootDB()); - auto icmp = InternalKeyComparator(idb->GetOptions().comparator); + auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); Arena arena; - ScopedArenaIterator iter( - idb->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber)); + ScopedArenaIterator iter(idb->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber, cfh)); if (!begin_key.empty()) { InternalKey ikey; From 6ca3feed5c5e3cb71a26f3aa58fdb46d64020c35 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 8 Jul 2019 00:09:44 -0700 Subject: [PATCH 208/572] Fix -Werror=shadow (#5546) Summary: This PR fixes shadow errors. 
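For illustration, here is a minimal self-contained sketch (hypothetical code, not taken from this diff) of the pattern that `-Werror=shadow` rejects, fixed the same way this patch fixes it, by renaming the local (compare `total_accesses` vs. `accesses` below):

```
#include <cstdint>

class Counter {
 public:
  uint64_t total_accesses() const { return hits_ + misses_; }

  double miss_ratio() const {
    // With -Werror=shadow the following declaration is a compile error,
    // because the local 'total_accesses' shadows the member function:
    //   uint64_t total_accesses = hits_ + misses_;
    // Renaming the local resolves the conflict.
    uint64_t accesses = hits_ + misses_;
    return accesses == 0 ? 0.0 : misses_ * 100.0 / accesses;
  }

 private:
  uint64_t hits_ = 0;
  uint64_t misses_ = 0;
};
```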
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5546 Test Plan: make clean && make check -j32 && make clean && USE_CLANG=1 make check -j32 && make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16147841 Pulled By: HaoyuHuang fbshipit-source-id: 1043500d70c134185f537ab4c3900452752f1534 --- table/block_based/block_based_table_reader.cc | 4 ++-- table/get_context.h | 2 +- utilities/simulator_cache/cache_simulator.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index baa5c397eb7..26c1365c4e7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3341,7 +3341,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - uint64_t tracing_get_id = get_context->tracing_get_id(); + uint64_t tracing_get_id = get_context->get_tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; { @@ -3517,7 +3517,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, mget_range->end()); uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { - tracing_mget_id = sst_file_range.begin()->get_context->tracing_get_id(); + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, tracing_mget_id}; diff --git a/table/get_context.h b/table/get_context.h index f567229cc9f..7a37beb2df2 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -136,7 +136,7 @@ class GetContext { void ReportCounters(); - uint64_t tracing_get_id() const { return tracing_get_id_; } + uint64_t get_tracing_get_id() const { return tracing_get_id_; } private: const Comparator* ucmp_; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 145efdb6cba..65f626036b0 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -34,8 +34,8 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { double CacheSimulator::miss_ratio() { uint64_t hits = sim_cache_->get_hit_counter(); uint64_t misses = sim_cache_->get_miss_counter(); - uint64_t total_accesses = hits + misses; - return static_cast(misses * 100.0 / total_accesses); + uint64_t accesses = hits + misses; + return static_cast(misses * 100.0 / accesses); } uint64_t CacheSimulator::total_accesses() { From 872a261ffc2a440dfe9e60d99e421e42f5f2cf5e Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 8 Jul 2019 13:28:08 -0700 Subject: [PATCH 209/572] db_stress to print some internal keys after verification failure (#5543) Summary: Print out some more information when db_stress fails with verification failures to help debug problems.
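As a rough sketch of the new diagnostic (assuming an already-open `DB* db`; this is illustrative code, not part of the diff), the added logic boils down to dumping the internal key versions in the mismatched range via `GetAllKeyVersions()` from `rocksdb/utilities/debug.h`:

```
#include <cinttypes>
#include <cstdio>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/utilities/debug.h"

// Prints up to eight internal key versions in [begin_key, end_key] of the
// default column family.
void DumpInternalKeys(rocksdb::DB* db, const rocksdb::Slice& begin_key,
                      const rocksdb::Slice& end_key) {
  const size_t kMaxNumIKeys = 8;
  std::vector<rocksdb::KeyVersion> versions;
  rocksdb::Status s = rocksdb::GetAllKeyVersions(db, begin_key, end_key,
                                                 kMaxNumIKeys, &versions);
  if (!s.ok()) {
    fprintf(stderr, "%s\n", s.ToString().c_str());
    return;
  }
  for (const rocksdb::KeyVersion& kv : versions) {
    // Each entry carries the user key plus its sequence number and value type.
    fprintf(stderr, "  key %s seq %" PRIu64 " type %d\n",
            rocksdb::Slice(kv.user_key).ToString(true /* hex */).c_str(),
            kv.sequence, kv.type);
  }
}
```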
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5543 Test Plan: Manually ingest some failures and observe the outputs are like this: Verification failed [default] 0000000000199A5A => 7C3D000078797A7B74757677707172736C6D6E6F68696A6B [6] 000000000019C8BD => 65380000616063626D6C6F6E69686B6A internal keys in default CF [0000000000199A5A, 000000000019C8BD] (max 8) key 0000000000199A5A seq 179246 type 1 key 000000000019C8BD seq 163970 type 1 Latest Sequence Number: 292234 Differential Revision: D16153717 fbshipit-source-id: b33fa50a828c190cbf8249a37955432044f92daf --- tools/db_stress.cc | 39 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 813f8068278..66a10d4f3a2 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -55,6 +55,7 @@ int main() { #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/debug.h" #include "rocksdb/utilities/options_util.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -4261,16 +4262,46 @@ class AtomicFlushStressTest : public StressTest { key = iters[i]->key(); value = iters[i]->value(); } else { - if (key.compare(iters[i]->key()) != 0) { + int cmp = key.compare(iters[i]->key()); + if (cmp != 0) { fprintf(stderr, "Verification failed\n"); - fprintf(stderr, "cf%s: %s => %s\n", + fprintf(stderr, "[%s] %s => %s\n", column_families_[0]->GetName().c_str(), key.ToString(true /* hex */).c_str(), - value.ToString(/* hex */).c_str()); - fprintf(stderr, "cf%s: %s => %s\n", + value.ToString(true /* hex */).c_str()); + fprintf(stderr, "[%s] %s => %s\n", column_families_[i]->GetName().c_str(), iters[i]->key().ToString(true /* hex */).c_str(), iters[i]->value().ToString(true /* hex */).c_str()); +#ifndef ROCKSDB_LITE + Slice begin_key; + Slice end_key; + if (cmp < 0) { + begin_key = key; + end_key = iters[i]->key(); + } else { + begin_key = iters[i]->key(); + end_key = key; + } + // We should print both of CF 0 and i but GetAllKeyVersions() now + // only supports default CF. + std::vector versions; + const size_t kMaxNumIKeys = 8; + Status s = GetAllKeyVersions(db_, begin_key, end_key, kMaxNumIKeys, + &versions); + fprintf(stderr, + "Internal keys in default CF [%s, %s] (max %" ROCKSDB_PRIszt + ")\n", + begin_key.ToString(true /* hex */).c_str(), + end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); + for (const KeyVersion& kv : versions) { + fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", + Slice(kv.user_key).ToString(true).c_str(), kv.sequence, + kv.type); + } +#endif // ROCKSDB_LITE + fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", + db_->GetLatestSequenceNumber()); shared->SetVerificationFailure(); } } From a6a9213a367819bbe2c16b398f00f7dfa9b0dc18 Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Tue, 9 Jul 2019 10:47:31 -0700 Subject: [PATCH 210/572] Fix interpreter lines for files with python2-only syntax.
Reviewed By: lisroach Differential Revision: D15362271 fbshipit-source-id: 48fab12ab6e55a8537b19b4623d2545ca9950ec5 --- coverage/parse_gcov_output.py | 1 + tools/db_crashtest.py | 2 +- tools/ldb_test.py | 1 + tools/write_stress_runner.py | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py index fbdabd96839..a5e98722202 100644 --- a/coverage/parse_gcov_output.py +++ b/coverage/parse_gcov_output.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import re import sys diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 173a6a8da9c..2a38d4c96d9 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os import sys diff --git a/tools/ldb_test.py b/tools/ldb_test.py index 26167ee83fd..4403379460b 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os import glob diff --git a/tools/write_stress_runner.py b/tools/write_stress_runner.py index 9a0e920a724..fc0c99c235a 100644 --- a/tools/write_stress_runner.py +++ b/tools/write_stress_runner.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import subprocess import argparse From cb19e7411f17713adcfefbd45988dc6b18174914 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 9 Jul 2019 11:01:12 -0700 Subject: [PATCH 211/572] Fix bugs in DBWALTest.kTolerateCorruptedTailRecords triggered by #5520 (#5550) Summary: https://github.com/facebook/rocksdb/pull/5520 caused a buffer overflow bug in DBWALTest.kTolerateCorruptedTailRecords. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5550 Test Plan: Run the test in UBSAN. It used to fail. Now it succeeds. Differential Revision: D16165516 fbshipit-source-id: 42c56a6bc64eb091f054b87757fcbef60da825f7 --- db/db_wal_test.cc | 4 +++- file/file_util.cc | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 4859bdc90f4..2d5e7bc1d53 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -824,7 +824,9 @@ class RecoveryTestHelper { // Create WAL files with values filled in static void FillData(DBWALTest* test, const Options& options, const size_t wal_count, size_t* count) { - const ImmutableDBOptions db_options(options); + // Calling internal functions requires sanitized options. + Options sanitized_options = SanitizeOptions(test->dbname_, options); + const ImmutableDBOptions db_options(sanitized_options); *count = 0; diff --git a/file/file_util.cc b/file/file_util.cc index 050d25da1a7..ee52bf640fb 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -110,6 +110,7 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { bool same = false; + assert(!db_options->db_paths.empty()); Status s = db_options->env->AreFilesSame(db_options->wal_dir, db_options->db_paths[0].path, &same); if (s.IsNotSupported()) { From aa0367aabbb2ee891a4f7674351d8b10875670fa Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 9 Jul 2019 12:46:01 -0700 Subject: [PATCH 212/572] Allow ldb to open DB as secondary (#5537) Summary: Right now ldb can open a running DB through the read-only DB interface.
However, it might leave info log files in the read-only DB directory. Add an option to open the DB as secondary to avoid it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5537 Test Plan: Run ./ldb scan --max_keys=10 --db=/tmp/rocksdbtest-2491/dbbench --secondary_path=/tmp --no_value --hex and ./ldb get 0x00000000000000103030303030303030 --hex --db=/tmp/rocksdbtest-2491/dbbench --secondary_path=/tmp against a normal db_bench run and observe the output changes. Also observe that no new info log files are created under /tmp/rocksdbtest-2491/dbbench. Run without --secondary_path and observe that new info log files are created under /tmp/rocksdbtest-2491/dbbench. Differential Revision: D16113886 fbshipit-source-id: 4e09dec47c2528f6ca08a9e7a7894ba2d9daebbb --- HISTORY.md | 1 + include/rocksdb/utilities/ldb_cmd.h | 5 +++++ tools/ldb_cmd.cc | 29 +++++++++++++++++++++++++---- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d7eb51160ee..099c9f37e86 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index 57ab88a34eb..e7000742d1b 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -31,6 +31,7 @@ class LDBCommand { // Command-line arguments static const std::string ARG_DB; static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; static const std::string ARG_HEX; static const std::string ARG_KEY_HEX; static const std::string ARG_VALUE_HEX; @@ -128,6 +129,10 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb will leave the source directory completely intact.
+ std::string secondary_path_; std::string column_family_name_; DB* db_; DBWithTTL* db_ttl_; diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index fba32d9d622..8f4258cf36e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -47,6 +47,7 @@ namespace rocksdb { const std::string LDBCommand::ARG_DB = "db"; const std::string LDBCommand::ARG_PATH = "path"; +const std::string LDBCommand::ARG_SECONDARY_PATH = "secondary_path"; const std::string LDBCommand::ARG_HEX = "hex"; const std::string LDBCommand::ARG_KEY_HEX = "key_hex"; const std::string LDBCommand::ARG_VALUE_HEX = "value_hex"; @@ -321,6 +322,12 @@ LDBCommand::LDBCommand(const std::map& options, column_family_name_ = kDefaultColumnFamilyName; } + itr = options.find(ARG_SECONDARY_PATH); + secondary_path_ = ""; + if (itr != options.end()) { + secondary_path_ = itr->second; + } + is_key_hex_ = IsKeyHex(options, flags); is_value_hex_ = IsValueHex(options, flags); is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); @@ -360,6 +367,10 @@ void LDBCommand::OpenDB() { exec_state_ = LDBCommandExecuteResult::Failed( "ldb doesn't support TTL DB with multiple column families"); } + if (!secondary_path_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Open as secondary is not supported for TTL DB yet."); + } if (is_read_only_) { st = DBWithTTL::Open(options_, db_path_, &db_ttl_, 0, true); } else { @@ -382,7 +393,7 @@ void LDBCommand::OpenDB() { } } } - if (is_read_only_) { + if (is_read_only_ && secondary_path_.empty()) { if (column_families_.empty()) { st = DB::OpenForReadOnly(options_, db_path_, &db_); } else { @@ -391,10 +402,19 @@ void LDBCommand::OpenDB() { } } else { if (column_families_.empty()) { - st = DB::Open(options_, db_path_, &db_); + if (secondary_path_.empty()) { + st = DB::Open(options_, db_path_, &db_); + } else { + st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_); + } } else { - st = DB::Open(options_, db_path_, column_families_, &handles_opened, - &db_); + if (secondary_path_.empty()) { + st = DB::Open(options_, db_path_, column_families_, &handles_opened, + &db_); + } else { + st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, + column_families_, &handles_opened, &db_); + } } } } @@ -452,6 +472,7 @@ ColumnFamilyHandle* LDBCommand::GetCfHandle() { std::vector LDBCommand::BuildCmdLineOptions( std::vector options) { std::vector ret = {ARG_DB, + ARG_SECONDARY_PATH, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, From f786b4a5b4f1f162a7e7452b33e2e5cf0d755b9b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 9 Jul 2019 12:57:02 -0700 Subject: [PATCH 213/572] Improve result print on atomic flush stress test failure (#5549) Summary: When atomic flush stress test fails, we print internal keys within the range with mismatched key/values for all column families. Test plan (on devserver) Manually hack the code to randomly insert wrong data. Run the test. 
``` $make clean && COMPILE_WITH_TSAN=1 make -j32 db_stress $./db_stress -test_atomic_flush=true -ops_per_thread=10000 ``` Check that proper error messages are printed, as follows: ``` 2019/07/08-17:40:14 Starting verification Verification failed Latest Sequence Number: 190903 [default] 000000000000050B => 56290000525350515E5F5C5D5A5B5859 [3] 0000000000000533 => EE100000EAEBE8E9E6E7E4E5E2E3E0E1FEFFFCFDFAFBF8F9 Internal keys in CF 'default', [000000000000050B, 0000000000000533] (max 8) key 000000000000050B seq 139920 type 1 key 0000000000000533 seq 0 type 1 Internal keys in CF '3', [000000000000050B, 0000000000000533] (max 8) key 0000000000000533 seq 0 type 1 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5549 Differential Revision: D16158709 Pulled By: riversand963 fbshipit-source-id: f07fa87763f87b3bd908da03c956709c6456bcab --- tools/db_stress.cc | 77 +++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 66a10d4f3a2..3f767a9e76a 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -923,7 +923,8 @@ class SharedState { stress_test_(stress_test), verification_failure_(false), no_overwrite_ids_(FLAGS_column_families), - values_(nullptr) { + values_(nullptr), + printing_verification_results_(false) { // Pick random keys in each column family that will not experience // overwrite @@ -1204,6 +1205,16 @@ class SharedState { return expected_mmap_buffer_.get() != nullptr; } + bool PrintingVerificationResults() { + bool tmp = false; + return !printing_verification_results_.compare_exchange_strong( + tmp, true, std::memory_order_relaxed); + } + + void FinishPrintingVerificationResults() { + printing_verification_results_.store(false, std::memory_order_relaxed); + } + private: port::Mutex mu_; port::CondVar cv_; @@ -1231,6 +1242,7 @@ class SharedState { // and storing it in the container may require copying depending on the impl. std::vector > > key_locks_; std::unique_ptr expected_mmap_buffer_; + std::atomic printing_verification_results_; }; const uint32_t SharedState::UNKNOWN_SENTINEL = 0xfffffffe; @@ -4235,6 +4247,7 @@ class AtomicFlushStressTest : public StressTest { } break; } else if (valid_cnt != iters.size()) { + shared->SetVerificationFailure(); for (size_t i = 0; i != num; ++i) { if (!iters[i]->Valid()) { if (statuses[i].ok()) { @@ -4250,13 +4263,19 @@ class AtomicFlushStressTest : public StressTest { column_families_[i]->GetName().c_str()); } } - shared->SetVerificationFailure(); + break; + } + if (shared->HasVerificationFailedYet()) { break; } // If the program reaches here, then all column families' iterators are // still valid. 
+ if (shared->PrintingVerificationResults()) { + continue; + } Slice key; Slice value; + int num_mismatched_cfs = 0; for (size_t i = 0; i != num; ++i) { if (i == 0) { key = iters[i]->key(); @@ -4264,11 +4283,16 @@ class AtomicFlushStressTest : public StressTest { } else { int cmp = key.compare(iters[i]->key()); if (cmp != 0) { - fprintf(stderr, "Verification failed\n"); - fprintf(stderr, "[%s] %s => %s\n", - column_families_[0]->GetName().c_str(), - key.ToString(true /* hex */).c_str(), - value.ToString(true /* hex */).c_str()); + ++num_mismatched_cfs; + if (1 == num_mismatched_cfs) { + fprintf(stderr, "Verification failed\n"); + fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", + db_->GetLatestSequenceNumber()); + fprintf(stderr, "[%s] %s => %s\n", + column_families_[0]->GetName().c_str(), + key.ToString(true /* hex */).c_str(), + value.ToString(true /* hex */).c_str()); + } fprintf(stderr, "[%s] %s => %s\n", column_families_[i]->GetName().c_str(), iters[i]->key().ToString(true /* hex */).c_str(), @@ -4283,29 +4307,38 @@ class AtomicFlushStressTest : public StressTest { begin_key = iters[i]->key(); end_key = key; } - // We should print both of CF 0 and i but GetAllKeyVersions() now - // only supports default CF. std::vector versions; const size_t kMaxNumIKeys = 8; - Status s = GetAllKeyVersions(db_, begin_key, end_key, kMaxNumIKeys, - &versions); - fprintf(stderr, - "Internal keys in default CF [%s, %s] (max %" ROCKSDB_PRIszt - ")\n", - begin_key.ToString(true /* hex */).c_str(), - end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); - for (const KeyVersion& kv : versions) { - fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", - Slice(kv.user_key).ToString(true).c_str(), kv.sequence, - kv.type); + const auto print_key_versions = [&](ColumnFamilyHandle* cfh) { + Status s = GetAllKeyVersions(db_, cfh, begin_key, end_key, + kMaxNumIKeys, &versions); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); + return; + } + assert(nullptr != cfh); + fprintf(stderr, + "Internal keys in CF '%s', [%s, %s] (max %" ROCKSDB_PRIszt + ")\n", + cfh->GetName().c_str(), + begin_key.ToString(true /* hex */).c_str(), + end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); + for (const KeyVersion& kv : versions) { + fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", + Slice(kv.user_key).ToString(true).c_str(), kv.sequence, + kv.type); + } + }; + if (1 == num_mismatched_cfs) { + print_key_versions(column_families_[0]); } + print_key_versions(column_families_[i]); #endif // ROCKSDB_LITE - fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", - db_->GetLatestSequenceNumber()); shared->SetVerificationFailure(); } } } + shared->FinishPrintingVerificationResults(); for (auto& iter : iters) { iter->Next(); } From 60d8b19836745ce01deed59138802a3aa75bc488 Mon Sep 17 00:00:00 2001 From: ggaurav28 <51927531+ggaurav28@users.noreply.github.com> Date: Tue, 9 Jul 2019 14:48:07 -0700 Subject: [PATCH 214/572] Implemented a file logger that uses WritableFileWriter (#5491) Summary: Current PosixLogger performs IO operations using posix calls. Thus the current implementation will not work for non-posix env. Created a new logger class EnvLogger that uses env specific WritableFileWriter for IO operations. 
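A brief usage sketch of the new factory (hypothetical file name; not code from this diff). Because `NewEnvLogger()` only relies on `Env::NewWritableFile()`, the resulting logger works with any `Env`, not just the posix one:

```
#include <memory>

#include "rocksdb/env.h"

void WriteOneInfoLine() {
  rocksdb::Env* env = rocksdb::Env::Default();
  std::shared_ptr<rocksdb::Logger> logger;
  rocksdb::Status s =
      rocksdb::NewEnvLogger("/tmp/example_info.log", env, &logger);
  if (!s.ok() || !logger) {
    return;
  }
  logger->SetInfoLogLevel(rocksdb::InfoLogLevel::INFO_LEVEL);
  // Log() buffers through the underlying WritableFileWriter; Flush() forces
  // the buffered bytes out.
  rocksdb::Log(logger, "EnvLogger backed by %s", "WritableFileWriter");
  logger->Flush();
}
```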
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5491 Test Plan: make check Differential Revision: D15909002 Pulled By: ggaurav28 fbshipit-source-id: 13a8105176e8e42db0c59798d48cb6a0dbccc965 --- CMakeLists.txt | 1 + Makefile | 4 + TARGETS | 5 + env/env.cc | 21 ++++ env/env_posix.cc | 7 +- env/io_posix.cc | 1 - include/rocksdb/env.h | 12 ++- logging/auto_roll_logger_test.cc | 24 +---- logging/env_logger.h | 165 +++++++++++++++++++++++++++++++ logging/env_logger_test.cc | 164 ++++++++++++++++++++++++++++++ src.mk | 1 + test_util/testutil.cc | 18 ++++ test_util/testutil.h | 3 + util/file_reader_writer.h | 2 +- 14 files changed, 400 insertions(+), 28 deletions(-) create mode 100644 logging/env_logger.h create mode 100644 logging/env_logger_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ca338bd63f..50e082662b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -949,6 +949,7 @@ if(WITH_TESTS) env/mock_env_test.cc file/delete_scheduler_test.cc logging/auto_roll_logger_test.cc + logging/env_logger_test.cc logging/event_logger_test.cc memory/arena_test.cc memtable/inlineskiplist_test.cc diff --git a/Makefile b/Makefile index b0b52a37365..f1834e0ecf9 100644 --- a/Makefile +++ b/Makefile @@ -432,6 +432,7 @@ TESTS = \ inlineskiplist_test \ env_basic_test \ env_test \ + env_logger_test \ hash_test \ thread_local_test \ rate_limiter_test \ @@ -1529,6 +1530,9 @@ filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 3935f1f740d..82e1d375d96 100644 --- a/TARGETS +++ b/TARGETS @@ -368,6 +368,11 @@ ROCKS_TESTS = [ "logging/auto_roll_logger_test.cc", "serial", ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + ], [ "autovector_test", "util/autovector_test.cc", diff --git a/env/env.cc b/env/env.cc index e5e0e99c0a0..87b6b35c16c 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,6 +10,7 @@ #include "rocksdb/env.h" #include +#include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" @@ -22,6 +23,11 @@ namespace rocksdb { Env::~Env() { } +Status Env::NewLogger(const std::string& fname, + std::shared_ptr* result) { + return NewEnvLogger(fname, this, result); +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: @@ -422,5 +428,20 @@ EnvOptions::EnvOptions() { AssignEnvOptions(this, options); } +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result) { + EnvOptions options; + // TODO: Tune the buffer size. 
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const auto status = env->NewWritableFile(fname, &writable_file, options); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, env); + return Status::OK(); +} } // namespace rocksdb diff --git a/env/env_posix.cc b/env/env_posix.cc index c0edb00968e..7f7f6b2df5b 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -887,13 +887,14 @@ class PosixEnv : public Env { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w" + f = fopen(fname.c_str(), + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); } if (f == nullptr) { result->reset(); diff --git a/env/io_posix.cc b/env/io_posix.cc index 304c4ffe1c7..293516feee8 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,6 @@ #include #include #endif -#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index ba8978dc810..67464cc5c55 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -395,9 +395,11 @@ class Env { // same directory. virtual Status GetTestDirectory(std::string* path) = 0; - // Create and return a log file for storing informational messages. + // Creates and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide a custom + // logger. virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) = 0; + std::shared_ptr* result); // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -1563,4 +1565,10 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); // This is a factory method for TimedEnv defined in utilities/env_timed.cc. Env* NewTimedEnv(Env* base_env); +// Returns an instance of logger that can be used for storing informational +// messages.
+// This is a factory method for EnvLogger declared in logging/env_logger.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result); + } // namespace rocksdb diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index cce98d374ef..fa668114cfb 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/db.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace { @@ -444,7 +445,7 @@ TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + flush_thread = port::Thread([&]() { auto_roll_logger->Flush(); }); TEST_SYNC_POINT( "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, @@ -557,25 +558,6 @@ static std::vector GetOldFileNames(const std::string& path) { return ret; } -// Return the number of lines where a given pattern was found in the file -static size_t GetLinesCount(const std::string& fname, - const std::string& pattern) { - std::stringstream ssbuf; - std::string line; - size_t count = 0; - - std::ifstream inFile(fname.c_str()); - ssbuf << inFile.rdbuf(); - - while (getline(ssbuf, line)) { - if (line.find(pattern) != std::string::npos) { - count++; - } - } - - return count; -} - TEST_F(AutoRollLoggerTest, LogHeaderTest) { static const size_t MAX_HEADERS = 10; static const size_t LOG_MAX_SIZE = 1024 * 5; @@ -627,7 +609,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { // verify that the files rolled over ASSERT_NE(oldfname, newfname); // verify that the old log contains all the header logs - ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + ASSERT_EQ(test::GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); } } } diff --git a/logging/env_logger.h b/logging/env_logger.h new file mode 100644 index 00000000000..94cf129228c --- /dev/null +++ b/logging/env_logger.h @@ -0,0 +1,165 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that uses a custom Env object for logging.
+ +#pragma once + +#include +#include +#include "port/sys_time.h" +#include + +#include "monitoring/iostats_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/file_reader_writer.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class EnvLogger : public Logger { + public: + EnvLogger(std::unique_ptr&& writable_file, + const std::string& fname, const EnvOptions& options, Env* env, + InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(writable_file), fname, options, env), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + + ~EnvLogger() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } + } + + private: + void FlushLocked() { + mutex_.AssertHeld(); + if (flush_pending_) { + flush_pending_ = false; + file_.Flush(); + } + last_flush_micros_ = env_->NowMicros(); + } + + void Flush() override { + TEST_SYNC_POINT("EnvLogger::Flush:Begin1"); + TEST_SYNC_POINT("EnvLogger::Flush:Begin2"); + + MutexLock l(&mutex_); + FlushLocked(); + } + + Status CloseImpl() override { return CloseHelper(); } + + Status CloseHelper() { + mutex_.Lock(); + const auto close_status = file_.Close(); + mutex_.Unlock(); + + if (close_status.ok()) { + return close_status; + } + return Status::IOError("Close of log file failed with error:" + + (close_status.getState() + ? std::string(close_status.getState()) + : std::string())); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = env_->GetThreadID(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + mutex_.Lock(); + // We will ignore any error returned by Append(). + file_.Append(Slice(base, p - base)); + flush_pending_ = true; + const uint64_t now_micros = env_->NowMicros(); + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + FlushLocked(); + } + mutex_.Unlock(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + size_t GetLogFileSize() const override { + MutexLock l(&mutex_); + return file_.GetFileSize(); + } + + private: + WritableFileWriter file_; + mutable port::Mutex mutex_; // Mutex to protect the shared variables below. 
+ const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + std::atomic flush_pending_; +}; + +} // namespace rocksdb diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc new file mode 100644 index 00000000000..316c231fad9 --- /dev/null +++ b/logging/env_logger_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "env/mock_env.h" +#include "logging/env_logger.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +// In this test we only want to Log some simple log message with +// no format. +void LogMessage(std::shared_ptr logger, const std::string& message) { + Log(logger, "%s", message.c_str()); +} + +// Helper method to write the message num_times in the given logger. +void WriteLogs(std::shared_ptr logger, const std::string& message, + int num_times) { + for (int ii = 0; ii < num_times; ++ii) { + LogMessage(logger, message); + } +} + +} // namespace + +class EnvLoggerTest : public testing::Test { + public: + Env* env_; + + EnvLoggerTest() : env_(Env::Default()) {} + + ~EnvLoggerTest() = default; + + std::shared_ptr CreateLogger() { + std::shared_ptr result; + assert(NewEnvLogger(kLogFile, env_, &result).ok()); + assert(result); + result->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + return result; + } + + void DeleteLogFile() { + ASSERT_OK(env_->DeleteFile(kLogFile)); + } + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; +}; + +const std::string EnvLoggerTest::kSampleMessage = + "this is the message to be written to the log file!!"; +const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file"); + +TEST_F(EnvLoggerTest, EmptyLogFile) { + auto logger = CreateLogger(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Check the size of the log file. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, LogMultipleLines) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + // Flush the logs. + logger->Flush(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Overwrite) { + { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + } + + // Now reopen the file again. + { + auto logger = CreateLogger(); + + // File should be empty. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), 0); + ASSERT_EQ(logger->Close(), Status::OK()); + } + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Close) { + auto logger = CreateLogger(); + + // Write multiple lines. 
+ const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, ConcurrentLogging) { + auto logger = CreateLogger(); + + const int kNumIter = 20; + std::function cb = [&]() { + WriteLogs(logger, kSampleMessage, kNumIter); + logger->Flush(); + }; + + // Write to the logs from multiple threads. + std::vector threads; + const int kNumThreads = 5; + // Create threads. + for (int ii = 0; ii < kNumThreads; ++ii) { + threads.push_back(port::Thread(cb)); + } + + // Wait for them to complete. + for (auto& th : threads) { + th.join(); + } + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Verify the log file. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), + kNumIter * kNumThreads); + DeleteLogFile(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src.mk b/src.mk index 7c35ee67589..8b0122dbe22 100644 --- a/src.mk +++ b/src.mk @@ -345,6 +345,7 @@ MAIN_SOURCES = \ env/env_test.cc \ env/mock_env_test.cc \ logging/auto_roll_logger_test.cc \ + logging/env_logger_test.cc \ logging/event_logger_test.cc \ memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 61a49d88a17..46f878f8ce5 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include "db/memtable_list.h" @@ -426,5 +427,22 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +size_t GetLinesCount(const std::string& fname, const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + } // namespace test } // namespace rocksdb diff --git a/test_util/testutil.h b/test_util/testutil.h index bc0b2b07d5f..bb732ff3a5a 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -750,5 +750,8 @@ Status DestroyDir(Env* env, const std::string& dir); bool IsDirectIOSupported(Env* env, const std::string& dir); +// Return the number of lines where a given pattern was found in a file. +size_t GetLinesCount(const std::string& fname, const std::string& pattern); + } // namespace test } // namespace rocksdb diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 0a7e5032d2f..0c5089d0758 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -282,7 +282,7 @@ class WritableFileWriter { // returns NotSupported status. Status SyncWithoutFlush(bool use_fsync); - uint64_t GetFileSize() { return filesize_; } + uint64_t GetFileSize() const { return filesize_; } Status InvalidateCache(size_t offset, size_t length) { return writable_file_->InvalidateCache(offset, length); From 82d8ca8ade08b2c26acad33d954ba0b4cd770e2d Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 10 Jul 2019 11:26:22 -0700 Subject: [PATCH 215/572] Upload db directory during cleanup for certain tests (#5554) Summary: Add an extra cleanup step so that the db directory can be saved and uploaded.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5554 Reviewed By: yancouto Differential Revision: D16168844 Pulled By: riversand963 fbshipit-source-id: ec7b2cee5f11c7d388c36531f8b076d648e2fb19 --- build_tools/rocksdb-lego-determinator | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index dc32b3af9ff..af86a16c2be 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -63,6 +63,21 @@ CLEANUP_ENV=" 'user':'root' }" +UPLOAD_DB_DIR=" +{ + 'name':'Upload database directory', + 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', + 'user':'root', + 'cleanup':true, + 'provide_artifacts': [ + { + 'name':'rocksdb_db_dir', + 'paths': ['rocksdb_db.tar.gz'], + 'bundle': false, + }, + ], +}" + # We will eventually set the RATIO to 1, but we want do this # in steps. RATIO=$(nproc) will make it work as J=1 if [ -z $RATIO ]; then @@ -428,7 +443,8 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER - } + }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -519,6 +535,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -586,6 +603,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -678,6 +696,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } From 1a59b6e2a97c9933d323bdeb379bb72c43dfc41c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Thu, 11 Jul 2019 12:40:08 -0700 Subject: [PATCH 216/572] Cache simulator: Add a ghost cache for admission control and a hybrid row-block cache. (#5534) Summary: This PR adds a ghost cache for admission control. Specifically, it admits an entry on its second access. It also adds a hybrid row-block cache that caches the referenced key-value pairs of a Get/MultiGet request instead of its blocks. 
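Conceptually (a simplified sketch with made-up names, not the simulator's actual classes), second-access admission means a miss first registers the key in the ghost cache, and only a key the ghost cache already remembers gets inserted into the real cache:

```
#include <string>
#include <unordered_set>

// Sketch of a ghost cache used purely for admission control.
class GhostCacheSketch {
 public:
  // Returns true if the key was seen before, i.e. this is at least its
  // second access and the entry should be admitted to the real cache.
  bool AdmitOnSecondAccess(const std::string& key) {
    if (seen_.count(key) > 0) {
      return true;
    }
    seen_.insert(key);  // First access: remember it in the ghost cache only.
    return false;
  }

 private:
  // A production ghost cache bounds its footprint (e.g. an LRU over key
  // hashes); an unbounded set keeps this sketch short.
  std::unordered_set<std::string> seen_;
};
```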
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5534 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16101124 Pulled By: HaoyuHuang fbshipit-source-id: b99edda6418a888e94eb40f71ece45d375e234b1 --- CMakeLists.txt | 1 + Makefile | 4 + TARGETS | 5 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 29 +- tools/block_cache_trace_analyzer_test.cc | 11 +- trace_replay/block_cache_tracer.cc | 8 + trace_replay/block_cache_tracer.h | 1 + utilities/simulator_cache/cache_simulator.cc | 212 +++++++++-- utilities/simulator_cache/cache_simulator.h | 126 ++++++- .../simulator_cache/cache_simulator_test.cc | 337 ++++++++++++++++++ 11 files changed, 684 insertions(+), 51 deletions(-) create mode 100644 utilities/simulator_cache/cache_simulator_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 50e082662b5..c47f9811ef2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1006,6 +1006,7 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc + utilities/simulator_cache/cache_simulator_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc diff --git a/Makefile b/Makefile index f1834e0ecf9..1828b833b02 100644 --- a/Makefile +++ b/Makefile @@ -510,6 +510,7 @@ TESTS = \ cassandra_serialize_test \ ttl_test \ backupable_db_test \ + cache_simulator_test \ sim_cache_test \ version_edit_test \ version_set_test \ @@ -1321,6 +1322,9 @@ backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TE checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 82e1d375d96..6ef3da179dc 100644 --- a/TARGETS +++ b/TARGETS @@ -423,6 +423,11 @@ ROCKS_TESTS = [ "cache/cache_test.cc", "serial", ], + [ + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", + "serial", + ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", diff --git a/src.mk b/src.mk index 8b0122dbe22..bc49b7ce074 100644 --- a/src.mk +++ b/src.mk @@ -405,6 +405,7 @@ MAIN_SOURCES = \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 4770348a79d..bd8d8971bfc 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -23,9 +23,12 @@ DEFINE_string( block_cache_sim_config_path, "", "The config file path. One cache configuration per line. The format of a " "cache configuration is " - "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " - "cache_name is lru or lru_priority. cache_capacity can be xK, xM or xG " - "where x is a positive number."); + "cache_name,num_shard_bits,ghost_capacity,cache_capacity_1,...,cache_" + "capacity_N. 
Supported cache names are lru, lru_priority, lru_hybrid, and " + "lru_hybrid_no_insert_on_row_miss. User may also add a prefix 'ghost_' to " + "a cache_name to add a ghost cache in front of the real cache. " + "ghost_capacity and cache_capacity can be xK, xM or xG where x is a " + "positive number."); DEFINE_int32(block_cache_trace_downsample_ratio, 1, "The trace collected accesses on one in every " "block_cache_trace_downsample_ratio blocks. We scale " @@ -104,6 +107,10 @@ const std::string kGroupbyAll = "all"; const std::set kGroupbyLabels{ kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; +const std::string kSupportedCacheNames = + " lru ghost_lru lru_priority ghost_lru_priority lru_hybrid " + "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss " + "ghost_lru_hybrid_no_insert_on_row_miss "; std::string block_type_to_string(TraceType type) { switch (type) { @@ -194,7 +201,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { } // Write header. const std::string header = - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; + "cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio,total_" + "accesses"; out << header << std::endl; for (auto const& config_caches : cache_simulator_->sim_caches()) { const CacheConfiguration& config = config_caches.first; @@ -205,6 +213,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { out << ","; out << config.num_shard_bits; out << ","; + out << config.ghost_cache_capacity; + out << ","; out << config.cache_capacities[i]; out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; @@ -993,18 +1003,21 @@ std::vector parse_cache_config_file( config_strs.push_back(substr); } // Sanity checks. - if (config_strs.size() < 3) { + if (config_strs.size() < 4) { fprintf(stderr, "Invalid cache simulator configuration %s\n", line.c_str()); exit(1); } - if (config_strs[0] != "lru") { - fprintf(stderr, "We only support LRU cache %s\n", line.c_str()); + if (kSupportedCacheNames.find(" " + config_strs[0] + " ") == + std::string::npos) { + fprintf(stderr, "Invalid cache name %s. Supported cache names are %s\n", + line.c_str(), kSupportedCacheNames.c_str()); exit(1); } cache_config.cache_name = config_strs[0]; cache_config.num_shard_bits = ParseUint32(config_strs[1]); - for (uint32_t i = 2; i < config_strs.size(); i++) { + cache_config.ghost_cache_capacity = ParseUint64(config_strs[2]); + for (uint32_t i = 3; i < config_strs.size(); i++) { uint64_t capacity = ParseUint64(config_strs[i]); if (capacity == 0) { fprintf(stderr, "Invalid cache capacity %s, %s\n", diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 21d8bcbbb3f..efb202cb4ab 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -205,7 +205,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } { // Generate a cache sim config. 
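+    // The new third field is the ghost cache capacity; it is only consulted
+    // for cache names carrying the "ghost_" prefix, hence 0 for plain "lru".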
- std::string config = "lru,1,1K,1M,1G"; + std::string config = "lru,1,0,1K,1M,1G"; std::ofstream out(block_cache_sim_config_path_); ASSERT_TRUE(out.is_open()); out << config << std::endl; @@ -230,14 +230,15 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { getline(ss, substr, ','); result_strs.push_back(substr); } - ASSERT_EQ(5, result_strs.size()); + ASSERT_EQ(6, result_strs.size()); ASSERT_LT(config_index, expected_capacities.size()); ASSERT_EQ("lru", result_strs[0]); // cache_name ASSERT_EQ("1", result_strs[1]); // num_shard_bits + ASSERT_EQ("0", result_strs[2]); // ghost_cache_capacity ASSERT_EQ(std::to_string(expected_capacities[config_index]), - result_strs[2]); // cache_capacity - ASSERT_EQ("100.0000", result_strs[3]); // miss_ratio - ASSERT_EQ("50", result_strs[4]); // number of accesses. + result_strs[3]); // cache_capacity + ASSERT_EQ("100.0000", result_strs[4]); // miss_ratio + ASSERT_EQ("50", result_strs[5]); // number of accesses. config_index++; } ASSERT_EQ(expected_capacities.size(), config_index); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 115a75d924b..62db942044c 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -45,6 +45,14 @@ bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { caller == TableReaderCaller::kUserMultiGet; } +bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet || + caller == TableReaderCaller::kUserIterator || + caller == TableReaderCaller::kUserApproximateSize || + caller == TableReaderCaller::kUserVerifyChecksum; +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 4788a3f447f..66cbb5adefa 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -23,6 +23,7 @@ class BlockCacheTraceHelper { static bool ShouldTraceReferencedKey(TraceType block_type, TableReaderCaller caller); static bool ShouldTraceGetId(TableReaderCaller caller); + static bool IsUserAccess(TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 65f626036b0..ebfc4cd0eb0 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -4,42 +4,177 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "utilities/simulator_cache/cache_simulator.h" +#include "db/dbformat.h" namespace rocksdb { -CacheSimulator::CacheSimulator(std::shared_ptr sim_cache) + +namespace { +const std::string kGhostCachePrefix = "ghost_"; +} + +GhostCache::GhostCache(std::shared_ptr sim_cache) : sim_cache_(sim_cache) {} +bool GhostCache::Admit(const Slice& lookup_key) { + auto handle = sim_cache_->Lookup(lookup_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + return true; + } + sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), + /*deleter=*/nullptr, /*handle=*/nullptr); + return false; +} + +CacheSimulator::CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {} + void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool admit = true; + const bool is_user_access = + BlockCacheTraceHelper::IsUserAccess(access.caller); + bool is_cache_miss = true; + if (ghost_cache_ && access.no_insert == Boolean::kFalse) { + admit = ghost_cache_->Admit(access.block_key); + } auto handle = sim_cache_->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr, /*handle=*/nullptr); + if (handle != nullptr) { + sim_cache_->Release(handle); + is_cache_miss = false; + } else { + if (access.no_insert == Boolean::kFalse && admit) { + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr); + } } + UpdateMetrics(is_user_access, is_cache_miss); } -void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { - auto handle = sim_cache_->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - Cache::Priority priority = Cache::Priority::LOW; - if (access.block_type == TraceType::kBlockTraceFilterBlock || - access.block_type == TraceType::kBlockTraceIndexBlock || - access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { - priority = Cache::Priority::HIGH; +void CacheSimulator::UpdateMetrics(bool is_user_access, bool is_cache_miss) { + num_accesses_ += 1; + if (is_cache_miss) { + num_misses_ += 1; + } + if (is_user_access) { + user_accesses_ += 1; + if (is_cache_miss) { + user_misses_ += 1; } - sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + } +} + +Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( + const BlockCacheTraceRecord& access) const { + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + return Cache::Priority::HIGH; + } + return Cache::Priority::LOW; +} + +void PrioritizedCacheSimulator::AccessKVPair( + const Slice& key, uint64_t value_size, Cache::Priority priority, + bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, + bool update_metrics) { + assert(is_cache_miss); + assert(admitted); + *is_cache_miss = true; + *admitted = true; + if (ghost_cache_ && !no_insert) { + *admitted = ghost_cache_->Admit(key); + } + auto handle = sim_cache_->Lookup(key); + if (handle != nullptr) { + sim_cache_->Release(handle); + *is_cache_miss = false; + } else if (!no_insert && *admitted && value_size > 0) { + sim_cache_->Insert(key, /*value=*/nullptr, value_size, /*deleter=*/nullptr, /*handle=*/nullptr, priority); } + if (update_metrics) { + UpdateMetrics(is_user_access, 
*is_cache_miss); + } } -double CacheSimulator::miss_ratio() { - uint64_t hits = sim_cache_->get_hit_counter(); - uint64_t misses = sim_cache_->get_miss_counter(); - uint64_t accesses = hits + misses; - return static_cast(misses * 100.0 / accesses); +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); } -uint64_t CacheSimulator::total_accesses() { - return sim_cache_->get_hit_counter() + sim_cache_->get_miss_counter(); +std::string HybridRowBlockCacheSimulator::ComputeRowKey( + const BlockCacheTraceRecord& access) { + assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); + Slice key; + if (access.referenced_key_exist_in_block == Boolean::kTrue) { + key = ExtractUserKey(access.referenced_key); + } else { + key = access.referenced_key; + } + return std::to_string(access.sst_fd_number) + "_" + key.ToString(); +} + +void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + if (access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // This is a Get/MultiGet request. + const std::string& row_key = ComputeRowKey(access); + if (getid_getkeys_map_[access.get_id].find(row_key) == + getid_getkeys_map_[access.get_id].end()) { + // This is the first time that this key is accessed. Look up the key-value + // pair first. Do not update the miss/accesses metrics here since it will + // be updated later. + AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + /*no_insert=*/false, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/false); + InsertResult result = InsertResult::NO_INSERT; + if (admitted && access.referenced_data_size > 0) { + result = InsertResult::INSERTED; + } else if (admitted) { + result = InsertResult::ADMITTED; + } + getid_getkeys_map_[access.get_id][row_key] = + std::make_pair(is_cache_miss, result); + } + std::pair miss_inserted = + getid_getkeys_map_[access.get_id][row_key]; + if (!miss_inserted.first) { + // This is a cache hit. Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + UpdateMetrics(/*is_user_access=*/true, /*is_cache_miss=*/false); + return; + } + // The key-value pair observes a cache miss. We need to access its + // index/filter/data blocks. 
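+    // Whether a missed block is then inserted depends on both the trace's
+    // no_insert flag and insert_blocks_upon_row_kvpair_miss_ (see the
+    // no_insert argument below).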
+ AccessKVPair( + access.block_key, access.block_type, ComputeBlockPriority(access), + /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/true); + if (access.referenced_data_size > 0 && + miss_inserted.second == InsertResult::ADMITTED) { + sim_cache_->Insert( + row_key, /*value=*/nullptr, access.referenced_data_size, + /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); + getid_getkeys_map_[access.get_id][row_key] = + std::make_pair(true, InsertResult::INSERTED); + } + return; + } + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); } BlockCacheTraceSimulator::BlockCacheTraceSimulator( @@ -56,18 +191,41 @@ Status BlockCacheTraceSimulator::InitializeCaches() { // 1/'downsample_ratio' blocks. uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; std::shared_ptr sim_cache; - if (config.cache_name == "lru") { - sim_cache = std::make_shared(NewSimCache( + std::unique_ptr ghost_cache; + std::string cache_name = config.cache_name; + if (cache_name.find(kGhostCachePrefix) != std::string::npos) { + ghost_cache.reset(new GhostCache( + NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_name = cache_name.substr(kGhostCachePrefix.size()); + } + if (cache_name == "lru") { + sim_cache = std::make_shared( + std::move(ghost_cache), NewLRUCache(simulate_cache_capacity, config.num_shard_bits, /*strict_capacity_limit=*/false, - /*high_pri_pool_ratio=*/0), - /*real_cache=*/nullptr, config.num_shard_bits)); - } else if (config.cache_name == "lru_priority") { - sim_cache = std::make_shared(NewSimCache( + /*high_pri_pool_ratio=*/0)); + } else if (cache_name == "lru_priority") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5)); + } else if (cache_name == "lru_hybrid") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/true); + } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") { + sim_cache = std::make_shared( + std::move(ghost_cache), NewLRUCache(simulate_cache_capacity, config.num_shard_bits, /*strict_capacity_limit=*/false, /*high_pri_pool_ratio=*/0.5), - /*real_cache=*/nullptr, config.num_shard_bits)); + /*insert_blocks_upon_row_kvpair_miss=*/false); } else { // Not supported. return Status::InvalidArgument("Unknown cache name " + diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index b391d5dc8a5..b6667eeed12 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -5,7 +5,6 @@ #pragma once -#include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -14,22 +13,46 @@ namespace rocksdb { struct CacheConfiguration { std::string cache_name; // LRU. uint32_t num_shard_bits; + uint64_t ghost_cache_capacity; // ghost cache capacity in bytes. std::vector cache_capacities; // simulate cache capacities in bytes. 
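+  // Note: operator== and operator< below deliberately ignore
+  // cache_capacities, so a single configuration entry covers all of its
+  // simulated capacities.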
-  bool operator=(const CacheConfiguration& o) const {
-    return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits;
+  bool operator==(const CacheConfiguration& o) const {
+    return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+           ghost_cache_capacity == o.ghost_cache_capacity;
   }
   bool operator<(const CacheConfiguration& o) const {
     return cache_name < o.cache_name ||
-           (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits);
+           (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) ||
+           (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+            ghost_cache_capacity < o.ghost_cache_capacity);
   }
 };
 
+// A ghost cache admits an entry on its second access.
+class GhostCache {
+ public:
+  explicit GhostCache(std::shared_ptr<Cache> sim_cache);
+  ~GhostCache() = default;
+  // No copy and move.
+  GhostCache(const GhostCache&) = delete;
+  GhostCache& operator=(const GhostCache&) = delete;
+  GhostCache(GhostCache&&) = delete;
+  GhostCache& operator=(GhostCache&&) = delete;
+
+  // Returns true if the lookup_key is in the ghost cache.
+  // Returns false otherwise.
+  bool Admit(const Slice& lookup_key);
+
+ private:
+  std::shared_ptr<Cache> sim_cache_;
+};
+
 // A cache simulator that runs against a block cache trace.
 class CacheSimulator {
  public:
-  CacheSimulator(std::shared_ptr<SimCache> sim_cache);
+  CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                 std::shared_ptr<Cache> sim_cache);
   virtual ~CacheSimulator() = default;
   // No copy and move.
   CacheSimulator(const CacheSimulator&) = delete;
@@ -38,12 +61,37 @@ class CacheSimulator {
   CacheSimulator& operator=(CacheSimulator&&) = delete;
 
   virtual void Access(const BlockCacheTraceRecord& access);
-  void reset_counter() { sim_cache_->reset_counter(); }
-  double miss_ratio();
-  uint64_t total_accesses();
+  void reset_counter() {
+    num_misses_ = 0;
+    num_accesses_ = 0;
+    user_accesses_ = 0;
+    user_misses_ = 0;
+  }
+  double miss_ratio() const {
+    if (num_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(num_misses_ * 100.0 / num_accesses_);
+  }
+  uint64_t total_accesses() const { return num_accesses_; }
+
+  double user_miss_ratio() const {
+    if (user_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(user_misses_ * 100.0 / user_accesses_);
+  }
+  uint64_t user_accesses() const { return user_accesses_; }
 
  protected:
-  std::shared_ptr<SimCache> sim_cache_;
+  void UpdateMetrics(bool is_user_access, bool is_cache_miss);
+
+  std::unique_ptr<GhostCache> ghost_cache_;
+  std::shared_ptr<Cache> sim_cache_;
+  uint64_t num_accesses_ = 0;
+  uint64_t num_misses_ = 0;
+  uint64_t user_accesses_ = 0;
+  uint64_t user_misses_ = 0;
 };
 
 // A prioritized cache simulator that runs against a block cache trace.
@@ -51,9 +99,65 @@ class CacheSimulator {
 // priority in the cache.
 class PrioritizedCacheSimulator : public CacheSimulator {
  public:
-  PrioritizedCacheSimulator(std::shared_ptr<SimCache> sim_cache)
-      : CacheSimulator(sim_cache) {}
+  PrioritizedCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                            std::shared_ptr<Cache> sim_cache)
+      : CacheSimulator(std::move(ghost_cache), sim_cache) {}
   void Access(const BlockCacheTraceRecord& access) override;
+
+ protected:
+  // Looks up (and possibly inserts) the key-value pair; sets *is_cache_miss
+  // on a miss and *admitted to the ghost cache's admission decision.
+  void AccessKVPair(const Slice& key, uint64_t value_size,
+                    Cache::Priority priority, bool no_insert,
+                    bool is_user_access, bool* is_cache_miss, bool* admitted,
+                    bool update_metrics);
+
+  Cache::Priority ComputeBlockPriority(
+      const BlockCacheTraceRecord& access) const;
+};
+
+// A hybrid row and block cache simulator.
It looks up/inserts key-value pairs +// referenced by Get/MultiGet requests, and not their accessed index/filter/data +// blocks. +// +// Upon a Get/MultiGet request, it looks up the referenced key first. +// If it observes a cache hit, future block accesses on this key-value pair is +// skipped since the request is served already. Otherwise, it continues to look +// up/insert its index/filter/data blocks. It also inserts the referenced +// key-value pair in the cache for future lookups. +class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { + public: + HybridRowBlockCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache, + bool insert_blocks_upon_row_kvpair_miss) + : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache), + insert_blocks_upon_row_kvpair_miss_( + insert_blocks_upon_row_kvpair_miss) {} + void Access(const BlockCacheTraceRecord& access) override; + + private: + // Row key is a concatenation of the access's fd_number and the referenced + // user key. + // TODO(haoyu): the row key should contain sequence number. + std::string ComputeRowKey(const BlockCacheTraceRecord& access); + + enum InsertResult : char { + INSERTED, + ADMITTED, + NO_INSERT, + }; + + // A map stores get_id to a map of row keys. For each row key, it stores a + // boolean and an enum. The first bool is true when we observe a miss upon the + // first time we encounter the row key. The second arg is INSERTED when the + // kv-pair has been inserted into the cache, ADMITTED if it should be inserted + // but haven't been, NO_INSERT if it should not be inserted. + // + // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not + // know its size. This may happen if the first access on the referenced key is + // an index/filter block. + std::map>> + getid_getkeys_map_; + bool insert_blocks_upon_row_kvpair_miss_; }; // A block cache simulator that reports miss ratio curves given a set of cache diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc new file mode 100644 index 00000000000..fb0c9e84976 --- /dev/null +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/simulator_cache/cache_simulator.h" + +#include +#include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { +namespace { +const std::string kBlockKeyPrefix = "test-block-"; +const std::string kRefKeyPrefix = "test-get-"; +const uint64_t kGetId = 1; +const uint64_t kGetBlockId = 100; +const uint64_t kCompactionBlockId = 1000; +const uint64_t kCacheSize = 1024 * 1024 * 1024; +const uint64_t kGhostCacheSize = 1024 * 1024; +} // namespace + +class CacheSimulatorTest : public testing::Test { + public: + const size_t kNumBlocks = 5; + const size_t kValueSize = 1000; + + CacheSimulatorTest() { env_ = rocksdb::Env::Default(); } + + BlockCacheTraceRecord GenerateGetRecord(uint64_t getid) { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kGetBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kUserGet; + record.level = 6; + record.sst_fd_number = kGetBlockId; + record.get_id = getid; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + record.referenced_key = + kRefKeyPrefix + std::to_string(kGetId) + std::string(8, 'c'); + record.referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_data_size = 100; + record.num_keys_in_block = 300; + return record; + } + + BlockCacheTraceRecord GenerateCompactionRecord() { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kCompactionBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kCompaction; + record.level = 6; + record.sst_fd_number = kCompactionBlockId; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kTrue; + return record; + } + + Env* env_; +}; + +TEST_F(CacheSimulatorTest, GhostCache) { + const std::string key1 = "test1"; + const std::string key2 = "test2"; + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + EXPECT_FALSE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_FALSE(ghost_cache->Admit(key2)); + EXPECT_TRUE(ghost_cache->Admit(key2)); +} + +TEST_F(CacheSimulatorTest, CacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& compaction_access = GenerateCompactionRecord(); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new CacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + + cache_simulator->Access(compaction_access); + cache_simulator->Access(compaction_access); + ASSERT_EQ(4, cache_simulator->total_accesses()); + ASSERT_EQ(75, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(50, 
cache_simulator->user_miss_ratio()); + + cache_simulator->reset_counter(); + ASSERT_EQ(0, cache_simulator->total_accesses()); + ASSERT_EQ(-1, cache_simulator->miss_ratio()); + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + handle = sim_cache->Lookup(compaction_access.block_key); + ASSERT_EQ(nullptr, handle); +} + +TEST_F(CacheSimulatorTest, GhostCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator(new CacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + // Both of them will be miss since we have a ghost cache. + ASSERT_EQ(100, cache_simulator->miss_ratio()); +} + +TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio()); + + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); +} + +TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + // Both of them will be miss since we have a ghost cache. + ASSERT_EQ(100, cache_simulator->miss_ratio()); +} + +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1); + second_get.referenced_data_size = 0; + second_get.referenced_key_exist_in_block = Boolean::kFalse; + second_get.referenced_key = kRefKeyPrefix + std::to_string(kGetId); + BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2); + third_get.referenced_data_size = 0; + third_get.referenced_key_exist_in_block = Boolean::kFalse; + third_get.referenced_key = kRefKeyPrefix + "third_get"; + // We didn't find the referenced key in the third get. 
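+  // With referenced_data_size == 0, the simulator can admit but never
+  // actually insert this row key (it stays in InsertResult::ADMITTED).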
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          nullptr, sim_cache, /*insert_blocks_upon_row_kvpair_miss=*/true));
+  // The first get request accesses 10 blocks. We should only report 10
+  // accesses and a 100% miss ratio.
+  for (uint32_t i = 0; i < 10; i++) {
+    first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(first_get);
+    block_id++;
+  }
+  ASSERT_EQ(10, cache_simulator->total_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio());
+  ASSERT_EQ(10, cache_simulator->user_accesses());
+  ASSERT_EQ(100, cache_simulator->user_miss_ratio());
+  auto handle = sim_cache->Lookup(
+      ExtractUserKey(std::to_string(first_get.sst_fd_number) + "_" +
+                     first_get.referenced_key));
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    ASSERT_NE(nullptr, handle);
+    sim_cache->Release(handle);
+  }
+
+  // The second get request accesses the same key. We should report 15
+  // accesses and a 66% miss ratio (10 misses out of 15 accesses).
+  // We do not consider these 5 block lookups as misses since the row hits the
+  // cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    second_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(second_get);
+    block_id++;
+  }
+  ASSERT_EQ(15, cache_simulator->total_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(cache_simulator->miss_ratio()));
+  ASSERT_EQ(15, cache_simulator->user_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  handle = sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" +
+                             second_get.referenced_key);
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    if (i < 110) {
+      ASSERT_NE(nullptr, handle) << i;
+      sim_cache->Release(handle);
+    } else {
+      ASSERT_EQ(nullptr, handle) << i;
+    }
+  }
+
+  // The third get is on a different key and has no referenced data size.
+  // This key should not be inserted into the cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    third_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(third_get);
+    block_id++;
+  }
+  ASSERT_EQ(20, cache_simulator->total_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(cache_simulator->miss_ratio()));
+  ASSERT_EQ(20, cache_simulator->user_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  // Assert that the third key is not inserted into the cache.
+ handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" + + third_get.referenced_key); + ASSERT_EQ(nullptr, handle); + for (uint32_t i = 100; i < block_id; i++) { + if (i < 110 || i >= 115) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_NE(nullptr, handle) << i; + sim_cache->Release(handle); + } else { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle) << i; + } + } +} + +TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/false)); + for (uint32_t i = 0; i < 9; i++) { + first_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(first_get); + block_id++; + } + auto handle = + sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + + "_" + first_get.referenced_key)); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + // All blocks are missing from the cache since insert_blocks_row_kvpair_misses + // is set to false. + for (uint32_t i = 100; i < block_id; i++) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle); + } +} + +TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) { + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + const BlockCacheTraceRecord& first_get = GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& second_get = GenerateGetRecord(kGetId + 1); + const BlockCacheTraceRecord& third_get = GenerateGetRecord(kGetId + 2); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0), + /*insert_blocks_row_kvpair_misses=*/false)); + // Two get requests access the same key. + cache_simulator->Access(first_get); + cache_simulator->Access(second_get); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + // We insert the key-value pair upon the second get request. A third get + // request should observe a hit. + for (uint32_t i = 0; i < 10; i++) { + cache_simulator->Access(third_get); + } + ASSERT_EQ(12, cache_simulator->total_accesses()); + ASSERT_EQ(16, static_cast(cache_simulator->miss_ratio())); + ASSERT_EQ(12, cache_simulator->user_accesses()); + ASSERT_EQ(16, static_cast(cache_simulator->user_miss_ratio())); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 3e9c5a35237d0ae5d1d8b0499b4dd8844e0ec56d Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 12 Jul 2019 16:52:15 -0700 Subject: [PATCH 217/572] Block cache analyzer: Add more stats (#5516) Summary: This PR provides more command line options for block cache analyzer to better understand block cache access pattern. 
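For example, a few of the new options might be combined in a single run (a hypothetical invocation; required options such as the trace file path are omitted, and the binary name is assumed from the tool's source file):

    ./block_cache_trace_analyzer \
        -analyze_callers=Get,Iterator \
        -access_count_buckets=1,10,100,1000 \
        -analyze_top_k_access_count_blocks=10

The new options: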
-analyze_bottom_k_access_count_blocks -analyze_top_k_access_count_blocks -reuse_lifetime_labels -reuse_lifetime_buckets -analyze_callers -access_count_buckets -analyze_blocks_reuse_k_reuse_window Pull Request resolved: https://github.com/facebook/rocksdb/pull/5516 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16037440 Pulled By: HaoyuHuang fbshipit-source-id: b9a4ac0d4712053fab910732077a4d4b91400bc8 --- tools/block_cache_trace_analyzer.cc | 1239 +++++++++++++++++----- tools/block_cache_trace_analyzer.h | 134 ++- tools/block_cache_trace_analyzer_test.cc | 262 ++++- trace_replay/block_cache_tracer.cc | 2 + trace_replay/block_cache_tracer.h | 3 + 5 files changed, 1312 insertions(+), 328 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index bd8d8971bfc..76633846257 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -44,20 +44,14 @@ DEFINE_bool(print_data_block_access_count_stats, false, DEFINE_int32(cache_sim_warmup_seconds, 0, "The number of seconds to warmup simulated caches. The hit/miss " "counters are reset after the warmup completes."); -DEFINE_string( - block_cache_analysis_result_dir, "", - "The directory that saves block cache analysis results. It contains 1) a " - "mrc file that saves the computed miss ratios for simulated caches. Its " - "format is " - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several " - "\"label_access_timeline\" files that contain number of accesses per " - "second grouped by the label. File format: " - "time,label_1_access_per_second,label_2_access_per_second,...,label_N_" - "access_per_second where N is the number of unique labels found in the " - "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" " - "csv files that contain the reuse distance/interval grouped by label. File " - "format: bucket,label_1,label_2,...,label_N. The first N buckets are " - "absolute values. The second N buckets are percentage values."); +DEFINE_int32(analyze_bottom_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the bottom k among all blocks."); +DEFINE_int32(analyze_top_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the top k among all blocks."); +DEFINE_string(block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results."); DEFINE_string( timeline_labels, "", "Group the number of accesses per block per second using these labels. " @@ -92,6 +86,42 @@ DEFINE_string( "seconds, between 10 seconds and 100 seconds, respectively. The last " "bucket contains the number of blocks with reuse interval longer than 100 " "seconds."); +DEFINE_string( + reuse_lifetime_labels, "", + "Group the reuse lifetime of a block using these labels. Reuse " + "lifetime is defined as the time interval between the first access on a " + "block and the last access on the same block. For blocks that are only " + "accessed once, its lifetime is set to kMaxUint64."); +DEFINE_string( + reuse_lifetime_buckets, "", + "Group blocks by their reuse lifetime given these buckets. For " + "example, if 'reuse_lifetime_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse lifetime less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. 
The last "
+    "bucket contains the number of blocks with reuse lifetime longer than 100 "
+    "seconds.");
+DEFINE_string(
+    analyze_callers, "",
+    "The list of callers to perform a detailed analysis on. If specified, the "
+    "analyzer will output a detailed percentage of accesses for each caller "
+    "broken down by column family, level, and block type. The available "
+    "callers are: Get, MultiGet, Iterator, ApproximateSize, VerifyChecksum, "
+    "SSTDumpTool, ExternalSSTIngestion, Repair, Prefetch, Compaction, "
+    "CompactionRefill, Flush, SSTFileReader, Uncategorized.");
+DEFINE_string(access_count_buckets, "",
+              "Group number of blocks by their access count given these "
+              "buckets. If specified, the analyzer will output a detailed "
+              "analysis on the number of blocks grouped by their access count "
+              "broken down by block type and column family.");
+DEFINE_int32(analyze_blocks_reuse_k_reuse_window, 0,
+             "Analyze the percentage of blocks that are accessed in the "
+             "[k, 2*k] seconds and accessed again in the next [2*k, 3*k], "
+             "[3*k, 4*k],...,[k*(n-1), k*n] seconds. ");
+DEFINE_string(analyze_get_spatial_locality_labels, "",
+              "Group data blocks using these labels.");
+DEFINE_string(analyze_get_spatial_locality_buckets, "",
+              "Group data blocks by their statistics using these buckets.");
 
 namespace rocksdb {
 namespace {
@@ -112,6 +142,24 @@ const std::string kSupportedCacheNames =
     "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss "
     "ghost_lru_hybrid_no_insert_on_row_miss ";
 
+// The suffix for the generated csv files.
+const std::string kFileNameSuffixAccessTimeline = "access_timeline";
+const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
+    "avg_reuse_interval_naccesses";
+const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval";
+const std::string kFileNameSuffixReuseInterval = "access_reuse_interval";
+const std::string kFileNameSuffixReuseLifetime = "reuse_lifetime";
+const std::string kFileNameSuffixAccessReuseBlocksTimeline =
+    "reuse_blocks_timeline";
+const std::string kFileNameSuffixPercentOfAccessSummary =
+    "percentage_of_accesses_summary";
+const std::string kFileNameSuffixPercentRefKeys = "percent_ref_keys";
+const std::string kFileNameSuffixPercentDataSizeOnRefKeys =
+    "percent_data_size_on_ref_keys";
+const std::string kFileNameSuffixPercentAccessesOnRefKeys =
+    "percent_accesses_on_ref_keys";
+const std::string kFileNameSuffixAccessCountSummary = "access_count_summary";
+
 std::string block_type_to_string(TraceType type) {
   switch (type) {
     case kBlockTraceFilterBlock:
@@ -168,6 +216,53 @@ std::string caller_to_string(TableReaderCaller caller) {
   return "InvalidCaller";
 }
 
+TableReaderCaller string_to_caller(std::string caller_str) {
+  if (caller_str == "Get") {
+    return kUserGet;
+  } else if (caller_str == "MultiGet") {
+    return kUserMultiGet;
+  } else if (caller_str == "Iterator") {
+    return kUserIterator;
+  } else if (caller_str == "ApproximateSize") {
+    return kUserApproximateSize;
+  } else if (caller_str == "VerifyChecksum") {
+    return kUserVerifyChecksum;
+  } else if (caller_str == "SSTDumpTool") {
+    return kSSTDumpTool;
+  } else if (caller_str == "ExternalSSTIngestion") {
+    return kExternalSSTIngestion;
+  } else if (caller_str == "Repair") {
+    return kRepair;
+  } else if (caller_str == "Prefetch") {
+    return kPrefetch;
+  } else if (caller_str == "Compaction") {
+    return kCompaction;
+  } else if (caller_str == "CompactionRefill") {
+    return kCompactionRefill;
+  } else if (caller_str == "Flush") {
+    return kFlush;
+  } else if (caller_str ==
"SSTFileReader") { + return kSSTFileReader; + } else if (caller_str == "Uncategorized") { + return kUncategorized; + } + return TableReaderCaller::kMaxBlockCacheLookupCaller; +} + +bool is_user_access(TableReaderCaller caller) { + switch (caller) { + case kUserGet: + case kUserMultiGet: + case kUserIterator: + case kUserApproximateSize: + case kUserVerifyChecksum: + return true; + default: + break; + } + return false; +} + const char kBreakLine[] = "***************************************************************\n"; @@ -248,7 +343,7 @@ std::set BlockCacheTraceAnalyzer::ParseLabelStr( std::string BlockCacheTraceAnalyzer::BuildLabel( const std::set& labels, const std::string& cf_name, uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, - const std::string& block_key) const { + uint64_t block_key) const { std::map label_value_map; label_value_map[kGroupbyAll] = kGroupbyAll; label_value_map[kGroupbyLevel] = std::to_string(level); @@ -256,7 +351,7 @@ std::string BlockCacheTraceAnalyzer::BuildLabel( label_value_map[kGroupbySSTFile] = std::to_string(fd); label_value_map[kGroupbyBlockType] = block_type_to_string(type); label_value_map[kGroupbyColumnFamily] = cf_name; - label_value_map[kGroupbyBlock] = block_key; + label_value_map[kGroupbyBlock] = std::to_string(block_key); // Concatenate the label values. std::string label; for (auto const& l : labels) { @@ -269,12 +364,14 @@ std::string BlockCacheTraceAnalyzer::BuildLabel( return label; } -void BlockCacheTraceAnalyzer::WriteAccessTimeline( - const std::string& label_str) const { - std::set labels = ParseLabelStr(label_str); - uint64_t start_time = port::kMaxUint64; - uint64_t end_time = 0; - std::map> label_access_timeline; +void BlockCacheTraceAnalyzer::TraverseBlocks( + std::function + block_callback) const { + uint64_t block_id = 0; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. const std::string& cf_name = cf_aggregates.first; @@ -289,42 +386,161 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline( for (auto const& block_access_info : block_type_aggregates.second.block_access_info_map) { // Stats per block. 
- for (auto const& timeline : - block_access_info.second.caller_num_accesses_timeline) { - const TableReaderCaller caller = timeline.first; - const std::string& block_key = block_access_info.first; - const std::string label = - BuildLabel(labels, cf_name, fd, level, type, caller, block_key); - for (auto const& naccess : timeline.second) { - const uint64_t timestamp = naccess.first; - const uint64_t num = naccess.second; - label_access_timeline[label][timestamp] += num; - start_time = std::min(start_time, timestamp); - end_time = std::max(end_time, timestamp); - } - } + block_callback(cf_name, fd, level, type, block_access_info.first, + block_id, block_access_info.second); + block_id++; } } } } +} + +void BlockCacheTraceAnalyzer::WriteGetSpatialLocality( + const std::string& label_str, + const std::vector& percent_buckets) const { + std::set labels = ParseLabelStr(label_str); + std::map> label_pnrefkeys_nblocks; + std::map> label_pnrefs_nblocks; + std::map> label_pndatasize_nblocks; + uint64_t nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType /*block_type*/, + const std::string& /*block_key*/, + uint64_t /*block_key_id*/, + const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + uint64_t naccesses = 0; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + if (caller_access.first == TableReaderCaller::kUserGet) { + naccesses += caller_access.second; + } + } + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock, + TableReaderCaller::kUserGet, /*block_id=*/0); + + const uint64_t percent_referenced_for_existing_keys = + static_cast(std::max( + percent(block.key_num_access_map.size(), block.num_keys), 0.0)); + const uint64_t percent_accesses_for_existing_keys = + static_cast(std::max( + percent(block.num_referenced_key_exist_in_block, naccesses), 0.0)); + const uint64_t percent_referenced_data_size = static_cast( + std::max(percent(block.referenced_data_size, block.block_size), 0.0)); + if (label_pnrefkeys_nblocks.find(label) == label_pnrefkeys_nblocks.end()) { + for (auto const& percent_bucket : percent_buckets) { + label_pnrefkeys_nblocks[label][percent_bucket] = 0; + label_pnrefs_nblocks[label][percent_bucket] = 0; + label_pndatasize_nblocks[label][percent_bucket] = 0; + } + } + label_pnrefkeys_nblocks[label] + .upper_bound(percent_referenced_for_existing_keys) + ->second += 1; + label_pnrefs_nblocks[label] + .upper_bound(percent_accesses_for_existing_keys) + ->second += 1; + label_pndatasize_nblocks[label] + .upper_bound(percent_referenced_data_size) + ->second += 1; + nblocks += 1; + }; + TraverseBlocks(block_callback); + WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys, + label_pnrefkeys_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentAccessesOnRefKeys, + label_pnrefs_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentDataSizeOnRefKeys, + label_pndatasize_nblocks, nblocks); +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str, + uint64_t time_unit, + bool user_access_only) const { + std::set labels = ParseLabelStr(label_str); + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + std::map> label_access_timeline; + std::map> access_count_block_id_map; + + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType 
type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + if (user_access_only && !is_user_access(caller)) { + continue; + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_id); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first / time_unit; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + naccesses += num; + } + } + if (naccesses > 0) { + access_count_block_id_map[naccesses].push_back(std::to_string(block_id)); + } + }; + TraverseBlocks(block_callback); // We have label_access_timeline now. Write them into a file. - const std::string output_path = - output_dir_ + "/" + label_str + "_access_timeline"; + const std::string user_access_prefix = + user_access_only ? "user_access_only_" : "all_access_"; + const std::string output_path = output_dir_ + "/" + user_access_prefix + + label_str + "_" + std::to_string(time_unit) + + "_" + kFileNameSuffixAccessTimeline; std::ofstream out(output_path); if (!out.is_open()) { return; } std::string header("time"); - for (auto const& label : label_access_timeline) { + if (labels.find("block") != labels.end()) { + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + // Write the most frequently accessed blocks first. + for (auto naccess_it = access_count_block_id_map.rbegin(); + naccess_it != access_count_block_id_map.rend(); naccess_it++) { + for (auto& block_id_it : naccess_it->second) { + std::string row(block_id_it); + for (uint64_t now = start_time; now <= end_time; now++) { + auto it = label_access_timeline[block_id_it].find(now); + row += ","; + if (it != label_access_timeline[block_id_it].end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + } + out.close(); + return; + } + for (uint64_t now = start_time; now <= end_time; now++) { header += ","; - header += label.first; + header += std::to_string(now); } out << header << std::endl; - std::string row; - for (uint64_t now = start_time; now <= end_time; now++) { - row = std::to_string(now); - for (auto const& label : label_access_timeline) { + for (auto const& label : label_access_timeline) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { auto it = label.second.find(now); row += ","; if (it != label.second.end()) { @@ -335,52 +551,38 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline( } out << row << std::endl; } + out.close(); } void BlockCacheTraceAnalyzer::WriteReuseDistance( const std::string& label_str, - const std::set& distance_buckets) const { + const std::vector& distance_buckets) const { std::set labels = ParseLabelStr(label_str); std::map> label_distance_num_reuses; uint64_t total_num_reuses = 0; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. 
- const uint64_t fd = file_aggregates.first; - const uint32_t level = file_aggregates.second.level; - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - const std::string& block_key = block_access_info.first; - const std::string label = BuildLabel( - labels, cf_name, fd, level, type, - TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); - if (label_distance_num_reuses.find(label) == - label_distance_num_reuses.end()) { - // The first time we encounter this label. - for (auto const& distance_bucket : distance_buckets) { - label_distance_num_reuses[label][distance_bucket] = 0; - } - } - for (auto const& reuse_distance : - block_access_info.second.reuse_distance_count) { - label_distance_num_reuses[label] - .upper_bound(reuse_distance.first) - ->second += reuse_distance.second; - total_num_reuses += reuse_distance.second; - } - } + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. + for (auto const& distance_bucket : distance_buckets) { + label_distance_num_reuses[label][distance_bucket] = 0; } } - } - + for (auto const& reuse_distance : block.reuse_distance_count) { + label_distance_num_reuses[label] + .upper_bound(reuse_distance.first) + ->second += reuse_distance.second; + total_num_reuses += reuse_distance.second; + } + }; + TraverseBlocks(block_callback); // We have label_naccesses and label_distance_num_reuses now. Write them into // a file. const std::string output_path = @@ -395,18 +597,6 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance( header += label_it.first; } out << header << std::endl; - // Absolute values. - for (auto const& bucket : distance_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_distance_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); - row += ","; - row += std::to_string(it->second); - } - out << row << std::endl; - } - // Percentage values. for (auto const& bucket : distance_buckets) { std::string row(std::to_string(bucket)); for (auto const& label_it : label_distance_num_reuses) { @@ -421,7 +611,7 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance( } void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats( - const std::string& label, const std::set& time_buckets, + const std::string& label, const std::vector& time_buckets, const std::map timeline, std::map>* label_time_num_reuses, uint64_t* total_num_reuses) const { @@ -434,119 +624,434 @@ void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats( } } auto it = timeline.begin(); - const uint64_t prev_timestamp = it->first; + uint64_t prev_timestamp = it->first; const uint64_t prev_num = it->second; it++; // Reused within one second. 
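+  // Accesses beyond the first within the same second land in the smallest
+  // bucket via upper_bound(0): prev_num accesses at one timestamp count as
+  // prev_num - 1 reuses.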
if (prev_num > 1) { - (*label_time_num_reuses)[label].upper_bound(1)->second += prev_num - 1; + (*label_time_num_reuses)[label].upper_bound(0)->second += prev_num - 1; *total_num_reuses += prev_num - 1; } while (it != timeline.end()) { const uint64_t timestamp = it->first; const uint64_t num = it->second; const uint64_t reuse_interval = timestamp - prev_timestamp; - (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += num; + (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += 1; + if (num > 1) { + (*label_time_num_reuses)[label].upper_bound(0)->second += num - 1; + } + prev_timestamp = timestamp; *total_num_reuses += num; + it++; + } +} + +void BlockCacheTraceAnalyzer::WriteStatsToFile( + const std::string& label_str, const std::vector& time_buckets, + const std::string& filename_suffix, + const std::map>& label_data, + uint64_t ntotal) const { + const std::string output_path = + output_dir_ + "/" + label_str + "_" + filename_suffix; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_data) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + for (auto const& bucket : time_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_data) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(percent(it->second, ntotal)); + } + out << row << std::endl; } + out.close(); } void BlockCacheTraceAnalyzer::WriteReuseInterval( const std::string& label_str, - const std::set& time_buckets) const { + const std::vector& time_buckets) const { std::set labels = ParseLabelStr(label_str); std::map> label_time_num_reuses; + std::map> label_avg_reuse_nblocks; + std::map> label_avg_reuse_naccesses; + uint64_t total_num_reuses = 0; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - const uint64_t fd = file_aggregates.first; - const uint32_t level = file_aggregates.second.level; - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - const std::string& block_key = block_access_info.first; - if (labels.find(kGroupbyCaller) != labels.end()) { - for (auto const& timeline : - block_access_info.second.caller_num_accesses_timeline) { - const TableReaderCaller caller = timeline.first; - const std::string label = BuildLabel(labels, cf_name, fd, level, - type, caller, block_key); - UpdateReuseIntervalStats(label, time_buckets, timeline.second, - &label_time_num_reuses, - &total_num_reuses); - } - continue; - } - // Does not group by caller so we need to flatten the access timeline. 
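// The four-level nesting being deleted here (column family -> SST file ->
// block type -> block) used to be repeated in every analysis method; this
// patch factors it into TraverseBlocks(), so each analysis now supplies
// only a callback. A minimal usage sketch, assuming the signature declared
// in block_cache_trace_analyzer.h:
//
//   uint64_t nblocks = 0;
//   TraverseBlocks([&](const std::string& /*cf_name*/, uint64_t /*fd*/,
//                      uint32_t /*level*/, TraceType /*type*/,
//                      const std::string& /*block_key*/,
//                      uint64_t /*block_id*/,
//                      const BlockAccessInfo& /*block*/) { nblocks++; });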
- const std::string label = BuildLabel( - labels, cf_name, fd, level, type, - TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); - std::map timeline; - for (auto const& caller_timeline : - block_access_info.second.caller_num_accesses_timeline) { - for (auto const& time_naccess : caller_timeline.second) { - timeline[time_naccess.first] += time_naccess.second; - } - } - UpdateReuseIntervalStats(label, time_buckets, timeline, - &label_time_num_reuses, &total_num_reuses); - } + uint64_t total_nblocks = 0; + uint64_t total_accesses = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + total_nblocks++; + total_accesses += block.num_accesses; + uint64_t avg_reuse_interval = 0; + if (block.num_accesses > 1) { + avg_reuse_interval = ((block.last_access_time - block.first_access_time) / + kMicrosInSecond) / + block.num_accesses; + } else { + avg_reuse_interval = port::kMaxUint64 - 1; + } + if (labels.find(kGroupbyCaller) != labels.end()) { + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_id); + UpdateReuseIntervalStats(label, time_buckets, timeline.second, + &label_time_num_reuses, &total_num_reuses); + } + return; + } + // Does not group by caller so we need to flatten the access timeline. + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + std::map timeline; + for (auto const& caller_timeline : block.caller_num_accesses_timeline) { + for (auto const& time_naccess : caller_timeline.second) { + timeline[time_naccess.first] += time_naccess.second; + } + } + UpdateReuseIntervalStats(label, time_buckets, timeline, + &label_time_num_reuses, &total_num_reuses); + if (label_avg_reuse_nblocks.find(label) == label_avg_reuse_nblocks.end()) { + for (auto const& time_bucket : time_buckets) { + label_avg_reuse_nblocks[label][time_bucket] = 0; + label_avg_reuse_naccesses[label][time_bucket] = 0; + } + } + label_avg_reuse_nblocks[label].upper_bound(avg_reuse_interval)->second += 1; + label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second += + block.num_accesses; + }; + TraverseBlocks(block_callback); + + // Write the stats into files. 
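// (A worked example of the classification above, assuming a block first
// seen at t=0s and last seen at t=100s with 4 accesses in total:
// avg_reuse_interval = 100 / 4 = 25 seconds, so the block is counted in
// the first bucket whose bound exceeds 25. A block accessed exactly once
// is assigned port::kMaxUint64 - 1 and therefore always falls into the
// catch-all port::kMaxUint64 bucket appended by parse_buckets().)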
+ WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval, + label_time_num_reuses, total_num_reuses); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixAvgReuseInterval, + label_avg_reuse_nblocks, total_nblocks); + WriteStatsToFile(label_str, time_buckets, + kFileNameSuffixAvgReuseIntervalNaccesses, + label_avg_reuse_naccesses, total_accesses); +} + +void BlockCacheTraceAnalyzer::WriteReuseLifetime( + const std::string& label_str, + const std::vector& time_buckets) const { + std::set labels = ParseLabelStr(label_str); + std::map> label_lifetime_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t lifetime = 0; + if (block.num_accesses > 1) { + lifetime = + (block.last_access_time - block.first_access_time) / kMicrosInSecond; + } else { + lifetime = port::kMaxUint64 - 1; + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + + if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) { + // The first time we encounter this label. + for (auto const& time_bucket : time_buckets) { + label_lifetime_nblocks[label][time_bucket] = 0; } } + label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1; + total_nblocks += 1; + }; + TraverseBlocks(block_callback); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime, + label_lifetime_nblocks, total_nblocks); +} + +void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( + uint64_t reuse_window, bool user_access_only, TraceType block_type) const { + // A map from block key to an array of bools that states whether a block is + // accessed in a time window. + std::map> block_accessed; + const uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + const uint64_t reuse_vector_size = (trace_duration / reuse_window); + if (reuse_vector_size < 2) { + // The reuse window is less than 2. We cannot calculate the reused + // percentage of blocks. + return; } + auto block_callback = [&](const std::string& /*cf_name*/, uint64_t /*fd*/, + uint32_t /*level*/, TraceType /*type*/, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + if (block_accessed.find(block_id) == block_accessed.end()) { + block_accessed[block_id].resize(reuse_vector_size); + for (uint64_t i = 0; i < reuse_vector_size; i++) { + block_accessed[block_id][i] = false; + } + } + for (auto const& caller_num : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = caller_num.first; + for (auto const& timeline : caller_num.second) { + const uint64_t timestamp = timeline.first; + const uint64_t elapsed_time = + timestamp - trace_start_timestamp_in_seconds_; + if (!user_access_only || (user_access_only && is_user_access(caller))) { + uint64_t index = + std::min(elapsed_time / reuse_window, reuse_vector_size - 1); + block_accessed[block_id][index] = true; + } + } + } + }; + TraverseBlocks(block_callback); - // We have label_naccesses and label_interval_num_reuses now. Write them into - // a file. + // A cell is the number of blocks accessed in a reuse window. + uint64_t reuse_table[reuse_vector_size][reuse_vector_size]; + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + // Initialize the reuse_table. 
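// (reuse_table[i][j], for j >= i, counts the blocks accessed in window i
// that are accessed again in window j; each row is printed below as a
// percentage of its diagonal entry reuse_table[i][i], i.e. of the blocks
// that were accessed in the starting window.)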
+ for (uint64_t i = 0; i < reuse_vector_size; i++) { + reuse_table[start_time][i] = 0; + } + // Examine all blocks. + for (auto const& block : block_accessed) { + for (uint64_t i = start_time; i < reuse_vector_size; i++) { + if (block.second[start_time] && block.second[i]) { + // This block is accessed at start time and at the current time. We + // increment reuse_table[start_time][i] since it is reused at the ith + // window. + reuse_table[start_time][i]++; + } + } + } + } + const std::string user_access_prefix = + user_access_only ? "_user_access_only_" : "_all_access_"; const std::string output_path = - output_dir_ + "/" + label_str + "_reuse_interval"; + output_dir_ + "/" + block_type_to_string(block_type) + + user_access_prefix + std::to_string(reuse_window) + "_" + + kFileNameSuffixAccessReuseBlocksTimeline; std::ofstream out(output_path); if (!out.is_open()) { return; } - std::string header("bucket"); - for (auto const& label_it : label_time_num_reuses) { + std::string header("start_time"); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { header += ","; - header += label_it.first; + header += std::to_string(start_time); } out << header << std::endl; - // Absolute values. - for (auto const& bucket : time_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_time_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + std::string row(std::to_string(start_time * reuse_window)); + for (uint64_t j = 0; j < reuse_vector_size; j++) { row += ","; - row += std::to_string(it->second); + if (j < start_time) { + row += "100.0"; + } else { + row += std::to_string(percent(reuse_table[start_time][j], + reuse_table[start_time][start_time])); + } } out << row << std::endl; } - // Percentage values. 
- for (auto const& bucket : time_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_time_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); - row += ","; - row += std::to_string(percent(it->second, total_num_reuses)); + out.close(); +} + +std::string BlockCacheTraceAnalyzer::OutputPercentAccessStats( + uint64_t total_accesses, + const std::map& cf_access_count) const { + std::string row; + for (auto const& cf_aggregates : cf_aggregates_map_) { + const std::string& cf_name = cf_aggregates.first; + const auto& naccess = cf_access_count.find(cf_name); + row += ","; + if (naccess != cf_access_count.end()) { + row += std::to_string(percent(naccess->second, total_accesses)); + } else { + row += "0"; } + } + return row; +} + +void BlockCacheTraceAnalyzer::WritePercentAccessSummaryStats() const { + std::map> + caller_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType /*type*/, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + const uint64_t naccess = caller_num.second; + caller_cf_accesses[caller][cf_name] += naccess; + total_accesses += naccess; + } + }; + TraverseBlocks(block_callback); + + const std::string output_path = + output_dir_ + "/" + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("caller"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& cf_naccess_it : caller_cf_accesses) { + const TableReaderCaller caller = cf_naccess_it.first; + std::string row; + row += caller_to_string(caller); + row += OutputPercentAccessStats(total_accesses, cf_naccess_it.second); out << row << std::endl; } out.close(); } +void BlockCacheTraceAnalyzer::WriteDetailedPercentAccessSummaryStats( + TableReaderCaller analyzing_caller) const { + std::map> level_cf_accesses; + std::map> bt_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t level, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + if (caller == analyzing_caller) { + const uint64_t naccess = caller_num.second; + level_cf_accesses[level][cf_name] += naccess; + bt_cf_accesses[type][cf_name] += naccess; + total_accesses += naccess; + } + } + }; + TraverseBlocks(block_callback); + { + const std::string output_path = + output_dir_ + "/" + caller_to_string(analyzing_caller) + "_level_" + + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("level"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& level_naccess_it : level_cf_accesses) { + const uint32_t level = level_naccess_it.first; + std::string row; + row += std::to_string(level); + row += OutputPercentAccessStats(total_accesses, level_naccess_it.second); + out << row << std::endl; + } + out.close(); + } + { + const std::string output_path = + output_dir_ + 
"/" + caller_to_string(analyzing_caller) + "_bt_" + + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bt"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& bt_naccess_it : bt_cf_accesses) { + const TraceType bt = bt_naccess_it.first; + std::string row; + row += block_type_to_string(bt); + row += OutputPercentAccessStats(total_accesses, bt_naccess_it.second); + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats( + const std::vector& access_count_buckets, + bool user_access_only) const { + // x: buckets. + // y: # of accesses. + std::map> bt_access_nblocks; + std::map> cf_access_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + const std::string type_str = block_type_to_string(type); + if (cf_access_nblocks.find(cf_name) == cf_access_nblocks.end()) { + // initialize. + for (auto& access : access_count_buckets) { + cf_access_nblocks[cf_name][access] = 0; + } + } + if (bt_access_nblocks.find(type_str) == bt_access_nblocks.end()) { + // initialize. + for (auto& access : access_count_buckets) { + bt_access_nblocks[type_str][access] = 0; + } + } + uint64_t naccesses = 0; + for (auto const& caller_access : block.caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; + } + } + if (naccesses == 0) { + return; + } + total_nblocks += 1; + bt_access_nblocks[type_str].upper_bound(naccesses)->second += 1; + cf_access_nblocks[cf_name].upper_bound(naccesses)->second += 1; + }; + TraverseBlocks(block_callback); + const std::string user_access_prefix = + user_access_only ? "user_access_only_" : "all_access_"; + WriteStatsToFile("cf", access_count_buckets, + user_access_prefix + kFileNameSuffixAccessCountSummary, + cf_access_nblocks, total_nblocks); + WriteStatsToFile("bt", access_count_buckets, + user_access_prefix + kFileNameSuffixAccessCountSummary, + bt_access_nblocks, total_nblocks); +} + BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, + bool compute_reuse_distance, std::unique_ptr&& cache_simulator) : env_(rocksdb::Env::Default()), trace_file_path_(trace_file_path), output_dir_(output_dir), + compute_reuse_distance_(compute_reuse_distance), cache_simulator_(std::move(cache_simulator)) {} void BlockCacheTraceAnalyzer::ComputeReuseDistance( @@ -577,19 +1082,28 @@ void BlockCacheTraceAnalyzer::RecordAccess( file_aggr.block_type_aggregates_map[access.block_type]; BlockAccessInfo& block_access_info = block_type_aggr.block_access_info_map[access.block_key]; - ComputeReuseDistance(&block_access_info); + if (compute_reuse_distance_) { + ComputeReuseDistance(&block_access_info); + } block_access_info.AddAccess(access); block_info_map_[access.block_key] = &block_access_info; + if (trace_start_timestamp_in_seconds_ == 0) { + trace_start_timestamp_in_seconds_ = + access.access_timestamp / kMicrosInSecond; + } + trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; - // Add this block to all existing blocks. 
- for (auto& cf_aggregates : cf_aggregates_map_) { - for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - for (auto& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - for (auto& existing_block : - block_type_aggregates.second.block_access_info_map) { - existing_block.second.unique_blocks_since_last_access.insert( - access.block_key); + if (compute_reuse_distance_) { + // Add this block to all existing blocks. + for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } } } } @@ -608,6 +1122,9 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (!s.ok()) { return s; } + uint64_t start = env_->NowMicros(); + uint64_t processed_records = 0; + uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; s = reader.ReadAccess(&access); @@ -618,6 +1135,17 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (cache_simulator_) { cache_simulator_->Access(access); } + processed_records++; + uint64_t now = env_->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + if (duration > 10 * time_interval) { + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second\n", + duration, processed_records / duration); + processed_records = 0; + time_interval++; + } } return Status::OK(); } @@ -626,26 +1154,21 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { HistogramStat bs_stats; std::map bt_stats_map; std::map> cf_bt_stats_map; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - bs_stats.Add(block_access_info.second.block_size); - bt_stats_map[type].Add(block_access_info.second.block_size); - cf_bt_stats_map[cf_name][type].Add( - block_access_info.second.block_size); + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + if (block.block_size == 0) { + // Block size may be 0 when 1) compaction observes a cache miss and + // does not insert the missing block into the cache again. 2) + // fetching filter blocks in SST files at the last level. 
+ return; } - } - } - } + bs_stats.Add(block.block_size); + bt_stats_map[type].Add(block.block_size); + cf_bt_stats_map[cf_name][type].Add(block.block_size); + }; + TraverseBlocks(block_callback); fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); @@ -665,33 +1188,151 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { } } -void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { +void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, + uint32_t bottom_k, + uint32_t top_k) const { HistogramStat access_stats; std::map bt_stats_map; std::map> cf_bt_stats_map; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - access_stats.Add(block_access_info.second.num_accesses); - bt_stats_map[type].Add(block_access_info.second.num_accesses); - cf_bt_stats_map[cf_name][type].Add( - block_access_info.second.num_accesses); - } + std::map> access_count_blocks; + auto block_callback = [&](const std::string& cf_name, uint64_t /*fd*/, + uint32_t /*level*/, TraceType type, + const std::string& block_key, uint64_t /*block_id*/, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& caller_access : block.caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; } } - } + if (naccesses == 0) { + return; + } + if (type == TraceType::kBlockTraceDataBlock) { + access_count_blocks[naccesses].push_back(block_key); + } + access_stats.Add(naccesses); + bt_stats_map[type].Add(naccesses); + cf_bt_stats_map[cf_name][type].Add(naccesses); + }; + TraverseBlocks(block_callback); fprintf(stdout, - "Block access count stats: The number of accesses per block.\n%s", + "Block access count stats: The number of accesses per block. %s\n%s", + user_access_only ? "User accesses only" : "All accesses", access_stats.ToString().c_str()); + uint32_t bottom_k_index = 0; + for (auto naccess_it = access_count_blocks.begin(); + naccess_it != access_count_blocks.end(); naccess_it++) { + bottom_k_index++; + if (bottom_k_index >= bottom_k) { + break; + } + std::map caller_naccesses; + uint64_t naccesses = 0; + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + caller_naccesses[caller_access.first] += caller_access.second; + naccesses += caller_access.second; + } + } + } + std::string statistics("Caller:"); + for (auto const& caller_naccessess_it : caller_naccesses) { + statistics += caller_to_string(caller_naccessess_it.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_naccessess_it.second, naccesses)); + statistics += ","; + } + fprintf(stdout, + "Bottom %" PRIu32 " access count. 
Access count=%" PRIu64 + " nblocks=%" PRIu64 " %s\n", + bottom_k, naccess_it->first, naccess_it->second.size(), + statistics.c_str()); + } + + uint32_t top_k_index = 0; + for (auto naccess_it = access_count_blocks.rbegin(); + naccess_it != access_count_blocks.rend(); naccess_it++) { + top_k_index++; + if (top_k_index >= top_k) { + break; + } + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + std::string statistics("Caller:"); + uint64_t naccesses = 0; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; + } + } + assert(naccesses > 0); + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + statistics += ","; + statistics += caller_to_string(caller_access.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_access.second, naccesses)); + } + } + uint64_t ref_keys_accesses = 0; + uint64_t ref_keys_does_not_exist_accesses = 0; + for (auto const& ref_key_caller_access : block->key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + ref_keys_accesses += caller_access.second; + } + } + } + for (auto const& ref_key_caller_access : + block->non_exist_key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + ref_keys_does_not_exist_accesses += caller_access.second; + } + } + } + statistics += ",nkeys="; + statistics += std::to_string(block->num_keys); + statistics += ",block_size="; + statistics += std::to_string(block->block_size); + statistics += ",num_ref_keys="; + statistics += std::to_string(block->key_num_access_map.size()); + statistics += ",percent_access_ref_keys="; + statistics += std::to_string(percent(ref_keys_accesses, naccesses)); + statistics += ",num_ref_keys_does_not_exist="; + statistics += std::to_string(block->non_exist_key_num_access_map.size()); + statistics += ",percent_access_ref_keys_does_not_exist="; + statistics += + std::to_string(percent(ref_keys_does_not_exist_accesses, naccesses)); + statistics += ",ref_data_size="; + statistics += std::to_string(block->referenced_data_size); + fprintf(stdout, + "Top %" PRIu32 " access count blocks access_count=%" PRIu64 + " %s\n", + top_k, naccess_it->first, statistics.c_str()); + // if (block->referenced_data_size > block->block_size) { + // for (auto const& ref_key_it : block->key_num_access_map) { + // ParsedInternalKey internal_key; + // ParseInternalKey(ref_key_it.first, &internal_key); + // printf("######%lu %lu %d %s\n", block->referenced_data_size, + // block->block_size, internal_key.type, + // internal_key.user_key.ToString().c_str()); + // } + // } + } + } + for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by block type %s: \n%s", @@ -727,62 +1368,49 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { HistogramStat stdev_naccesses_per_key_in_a_data_block; std::map cf_stdev_naccesses_per_key_in_a_data_block; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType /*type*/, const std::string& /*block_key*/, + uint64_t 
/*block_id*/, const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + // Use four decimal points. + uint64_t percent_referenced_for_existing_keys = (uint64_t)( + ((double)block.key_num_access_map.size() / (double)block.num_keys) * + 10000.0); + uint64_t percent_referenced_for_non_existing_keys = + (uint64_t)(((double)block.non_exist_key_num_access_map.size() / + (double)block.num_keys) * + 10000.0); + uint64_t percent_accesses_for_existing_keys = + (uint64_t)(((double)block.num_referenced_key_exist_in_block / + (double)block.num_accesses) * + 10000.0); - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - if (block_access_info.second.num_keys == 0) { - continue; + HistogramStat hist_naccess_per_key; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + hist_naccess_per_key.Add(caller_access.second); } - // Use four decimal points. - uint64_t percent_referenced_for_existing_keys = (uint64_t)( - ((double)block_access_info.second.key_num_access_map.size() / - (double)block_access_info.second.num_keys) * - 10000.0); - uint64_t percent_referenced_for_non_existing_keys = - (uint64_t)(((double)block_access_info.second - .non_exist_key_num_access_map.size() / - (double)block_access_info.second.num_keys) * - 10000.0); - uint64_t percent_accesses_for_existing_keys = (uint64_t)( - ((double) - block_access_info.second.num_referenced_key_exist_in_block / - (double)block_access_info.second.num_accesses) * - 10000.0); - - HistogramStat hist_naccess_per_key; - for (auto const& key_access : - block_access_info.second.key_num_access_map) { - hist_naccess_per_key.Add(key_access.second); - } - uint64_t avg_accesses = hist_naccess_per_key.Average(); - uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); - avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); - cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); - stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); - cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add( - stdev_accesses); - - existing_keys_stats.Add(percent_referenced_for_existing_keys); - cf_existing_keys_stats_map[cf_name].Add( - percent_referenced_for_existing_keys); - non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys); - cf_non_existing_keys_stats_map[cf_name].Add( - percent_referenced_for_non_existing_keys); - block_access_stats.Add(percent_accesses_for_existing_keys); - cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys); } - } - } - } + uint64_t avg_accesses = hist_naccess_per_key.Average(); + uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); + avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); + cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); + stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); + cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add(stdev_accesses); + + existing_keys_stats.Add(percent_referenced_for_existing_keys); + cf_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_existing_keys); + 
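// (A worked example of the fixed-point encoding above: if 3 of the 40 keys
// in a block are ever referenced, 3.0 / 40.0 * 10000.0 stores 750, i.e.
// 7.5% expressed in hundredths of a percent, so the histograms keep two
// decimal digits of precision while storing plain integers.)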
non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys); + cf_non_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_non_existing_keys); + block_access_stats.Add(percent_accesses_for_existing_keys); + cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys); + }; + TraverseBlocks(block_callback); fprintf(stdout, "Histogram on the number of referenced keys existing in a block over " "the total number of keys in a block: \n%s", @@ -1032,15 +1660,15 @@ std::vector parse_cache_config_file( return configs; } -std::set parse_buckets(const std::string& bucket_str) { - std::set buckets; +std::vector parse_buckets(const std::string& bucket_str) { + std::vector buckets; std::stringstream ss(bucket_str); while (ss.good()) { std::string bucket; getline(ss, bucket, ','); - buckets.insert(ParseUint64(bucket)); + buckets.push_back(ParseUint64(bucket)); } - buckets.insert(port::kMaxUint64); + buckets.push_back(port::kMaxUint64); return buckets; } @@ -1068,20 +1696,27 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { exit(1); } } - BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, - FLAGS_block_cache_analysis_result_dir, - std::move(cache_simulator)); + BlockCacheTraceAnalyzer analyzer( + FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, + !FLAGS_reuse_distance_labels.empty(), std::move(cache_simulator)); Status s = analyzer.Analyze(); if (!s.IsIncomplete()) { // Read all traces. fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); exit(1); } + fprintf(stdout, "Status: %s\n", s.ToString().c_str()); analyzer.PrintStatsSummary(); if (FLAGS_print_access_count_stats) { print_break_lines(/*num_break_lines=*/3); - analyzer.PrintAccessCountStats(); + analyzer.PrintAccessCountStats( + /*user_access_only=*/false, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats( + /*user_access_only=*/true, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); } if (FLAGS_print_block_size_stats) { print_break_lines(/*num_break_lines=*/3); @@ -1099,13 +1734,36 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { while (ss.good()) { std::string label; getline(ss, label, ','); - analyzer.WriteAccessTimeline(label); + if (label.find("block") != std::string::npos) { + analyzer.WriteAccessTimeline(label, kSecondInMinute, true); + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, true); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); + } else { + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + } } } + if (!FLAGS_analyze_callers.empty()) { + analyzer.WritePercentAccessSummaryStats(); + std::stringstream ss(FLAGS_analyze_callers); + while (ss.good()) { + std::string caller; + getline(ss, caller, ','); + analyzer.WriteDetailedPercentAccessSummaryStats(string_to_caller(caller)); + } + } + + if (!FLAGS_access_count_buckets.empty()) { + std::vector buckets = parse_buckets(FLAGS_access_count_buckets); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/true); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/false); + } + if (!FLAGS_reuse_distance_labels.empty() && !FLAGS_reuse_distance_buckets.empty()) { - std::set buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::vector buckets = parse_buckets(FLAGS_reuse_distance_buckets); std::stringstream 
ss(FLAGS_reuse_distance_labels); while (ss.good()) { std::string label; @@ -1116,7 +1774,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { if (!FLAGS_reuse_interval_labels.empty() && !FLAGS_reuse_interval_buckets.empty()) { - std::set buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::vector buckets = parse_buckets(FLAGS_reuse_interval_buckets); std::stringstream ss(FLAGS_reuse_interval_labels); while (ss.good()) { std::string label; @@ -1124,6 +1782,43 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteReuseInterval(label, buckets); } } + + if (!FLAGS_reuse_lifetime_labels.empty() && + !FLAGS_reuse_lifetime_buckets.empty()) { + std::vector buckets = parse_buckets(FLAGS_reuse_lifetime_buckets); + std::stringstream ss(FLAGS_reuse_lifetime_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseLifetime(label, buckets); + } + } + + if (FLAGS_analyze_blocks_reuse_k_reuse_window != 0) { + std::vector block_types{TraceType::kBlockTraceIndexBlock, + TraceType::kBlockTraceDataBlock, + TraceType::kBlockTraceFilterBlock}; + for (auto block_type : block_types) { + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/true, block_type); + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/false, block_type); + } + } + + if (!FLAGS_analyze_get_spatial_locality_labels.empty() && + !FLAGS_analyze_get_spatial_locality_buckets.empty()) { + std::vector buckets = + parse_buckets(FLAGS_analyze_get_spatial_locality_buckets); + std::stringstream ss(FLAGS_analyze_get_spatial_locality_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteGetSpatialLocality(label, buckets); + } + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 617b90280c9..feb7c21f22c 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -9,13 +9,13 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" #include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { - // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; @@ -23,11 +23,12 @@ struct BlockAccessInfo { uint64_t first_access_time = 0; uint64_t last_access_time = 0; uint64_t num_keys = 0; - std::map + std::map> key_num_access_map; // for keys exist in this block. - std::map + std::map> non_exist_key_num_access_map; // for keys do not exist in this block. uint64_t num_referenced_key_exist_in_block = 0; + uint64_t referenced_data_size = 0; std::map caller_num_access_map; // caller:timestamp:number_of_accesses. The granularity of the timestamp is // seconds. 
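// A minimal illustration of the nested layout, with synthetic values:
//   caller_num_accesses_timeline[TableReaderCaller::kUserGet][1561748352]
//       = 3;
// records that user Gets hit this block three times during that second of
// the trace.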
@@ -39,6 +40,12 @@ struct BlockAccessInfo {
   std::map<uint64_t, uint64_t> reuse_distance_count;
 
   void AddAccess(const BlockCacheTraceRecord& access) {
+    if (block_size != 0 && access.block_size != 0) {
+      assert(block_size == access.block_size);
+    }
+    if (num_keys != 0 && access.num_keys_in_block != 0) {
+      assert(num_keys == access.num_keys_in_block);
+    }
     if (first_access_time == 0) {
       first_access_time = access.access_timestamp;
     }
@@ -54,10 +61,18 @@
         access.caller)) {
       num_keys = access.num_keys_in_block;
       if (access.referenced_key_exist_in_block == Boolean::kTrue) {
-        key_num_access_map[access.referenced_key]++;
+        if (key_num_access_map.find(access.referenced_key) ==
+            key_num_access_map.end()) {
+          referenced_data_size += access.referenced_data_size;
+        }
+        key_num_access_map[access.referenced_key][access.caller]++;
         num_referenced_key_exist_in_block++;
+        if (referenced_data_size > block_size && block_size != 0) {
+          ParsedInternalKey internal_key;
+          ParseInternalKey(access.referenced_key, &internal_key);
+        }
       } else {
-        non_exist_key_num_access_map[access.referenced_key]++;
+        non_exist_key_num_access_map[access.referenced_key][access.caller]++;
       }
     }
   }
@@ -83,6 +98,7 @@ class BlockCacheTraceAnalyzer {
  public:
   BlockCacheTraceAnalyzer(
       const std::string& trace_file_path, const std::string& output_dir,
+      bool compute_reuse_distance,
       std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
   ~BlockCacheTraceAnalyzer() = default;
   // No copy and move.
@@ -122,7 +138,8 @@
 
   // Print access count distribution and the distribution break down by block
   // type and column family.
-  void PrintAccessCountStats() const;
+  void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
+                             uint32_t top_k) const;
 
   // Print data block accesses by user Get and Multi-Get.
   // It prints out 1) A histogram on the percentage of keys accessed in a data
@@ -131,24 +148,93 @@
   // accesses on keys that exist in a data block and its breakdown by column
   // family.
   void PrintDataBlockAccessStats() const;
 
+  // Write the percentage of accesses broken down by column family into a csv
+  // file saved in 'output_dir'.
+  //
+  // The file is named "percentage_of_accesses_summary". The file format is
+  // caller,cf_0,cf_1,...,cf_n where cf_i is a column family name found in
+  // the trace.
+  void WritePercentAccessSummaryStats() const;
+
+  // Write the percentage of accesses for the given caller broken down by
+  // column family, level, and block type into a csv file saved in
+  // 'output_dir'.
+  //
+  // It generates two files: 1) caller_level_percentage_of_accesses_summary and
+  // 2) caller_bt_percentage_of_accesses_summary which break down by level
+  // and block type, respectively. The file format is
+  // level/bt,cf_0,cf_1,...,cf_n where cf_i is a column family name found in
+  // the trace.
+  void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;
+
+  // Write the access count summary into a csv file saved in 'output_dir'.
+  // It groups blocks by their access count.
+  //
+  // It generates two files: 1) cf_access_count_summary and 2)
+  // bt_access_count_summary which break down the access count by column family
+  // and block type, respectively. The file format is
+  // cf/bt,bucket_0,bucket_1,...,bucket_N.
+  void WriteAccessCountSummaryStats(
+      const std::vector<uint64_t>& access_count_buckets,
+      bool user_access_only) const;
+
   // Write miss ratio curves of simulated cache configurations into a csv file
-  // saved in 'output_dir'.
+  // named "mrc" saved in 'output_dir'.
+  //
+  // The file format is
+  // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
   void WriteMissRatioCurves() const;
 
   // Write the access timeline into a csv file saved in 'output_dir'.
-  void WriteAccessTimeline(const std::string& label) const;
+  //
+  // The file is named "label_access_timeline". The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique labels found in the trace.
+  void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
+                           bool user_access_only) const;
 
   // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
   // distance is defined as the cumulative size of unique blocks read between
   // two consecutive accesses on the same block.
+  //
+  // The file is named "label_reuse_distance". The file format is
+  // bucket,label_1,label_2,...,label_N.
   void WriteReuseDistance(const std::string& label_str,
-                          const std::set<uint64_t>& distance_buckets) const;
+                          const std::vector<uint64_t>& distance_buckets) const;
 
   // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
   // interval is defined as the time between two consecutive accesses on the
-  // same block..
+  // same block.
+  //
+  // The file is named "label_reuse_interval". The file format is
+  // bucket,label_1,label_2,...,label_N.
   void WriteReuseInterval(const std::string& label_str,
-                          const std::set<uint64_t>& time_buckets) const;
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
+  // lifetime is defined as the time interval between the first access of a
+  // block and its last access.
+  //
+  // The file is named "label_reuse_lifetime". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseLifetime(const std::string& label_str,
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse timeline into a csv file saved in 'output_dir'.
+  //
+  // The file is named
+  // "block_type_user_access_only_reuse_window_reuse_timeline". The file format
+  // is start_time,0,1,...,N where N equals trace_duration / reuse_window.
+  void WriteBlockReuseTimeline(uint64_t reuse_window, bool user_access_only,
+                               TraceType block_type) const;
+
+  // Write the Get spatial locality into csv files saved in 'output_dir'.
+  //
+  // It generates three csv files: label_percent_ref_keys,
+  // label_percent_accesses_on_ref_keys, and
+  // label_percent_data_size_on_ref_keys.
+ void WriteGetSpatialLocality( + const std::string& label_str, + const std::vector& percent_buckets) const; const std::map& TEST_cf_aggregates_map() const { @@ -161,28 +247,48 @@ class BlockCacheTraceAnalyzer { std::string BuildLabel(const std::set& labels, const std::string& cf_name, uint64_t fd, uint32_t level, TraceType type, - TableReaderCaller caller, - const std::string& block_key) const; + TableReaderCaller caller, uint64_t block_key) const; void ComputeReuseDistance(BlockAccessInfo* info) const; void RecordAccess(const BlockCacheTraceRecord& access); void UpdateReuseIntervalStats( - const std::string& label, const std::set& time_buckets, + const std::string& label, const std::vector& time_buckets, const std::map timeline, std::map>* label_time_num_reuses, uint64_t* total_num_reuses) const; + std::string OutputPercentAccessStats( + uint64_t total_accesses, + const std::map& cf_access_count) const; + + void WriteStatsToFile( + const std::string& label_str, const std::vector& time_buckets, + const std::string& filename_suffix, + const std::map>& label_data, + uint64_t ntotal) const; + + void TraverseBlocks( + std::function + block_callback) const; + rocksdb::Env* env_; const std::string trace_file_path_; const std::string output_dir_; + const bool compute_reuse_distance_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; std::map block_info_map_; + uint64_t trace_start_timestamp_in_seconds_ = 0; + uint64_t trace_end_timestamp_in_seconds_ = 0; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index efb202cb4ab..45ef99eee75 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -56,6 +56,12 @@ class BlockCacheTracerTest : public testing::Test { reuse_distance_buckets_ = "1,1K,1M,1G"; reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; reuse_interval_buckets_ = "1,10,100,1000"; + reuse_lifetime_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_lifetime_buckets_ = "1,10,100,1000"; + analyzing_callers_ = "Get,Iterator"; + access_count_buckets_ = "2,3,4,5,10"; + analyze_get_spatial_locality_labels_ = "all"; + analyze_get_spatial_locality_buckets_ = "10,20,30,40,50,60,70,80,90,100"; } ~BlockCacheTracerTest() override { @@ -158,12 +164,22 @@ class BlockCacheTracerTest : public testing::Test { "-print_access_count_stats", "-print_data_block_access_count_stats", "-cache_sim_warmup_seconds=0", + "-analyze_bottom_k_access_count_blocks=5", + "-analyze_top_k_access_count_blocks=5", + "-analyze_blocks_reuse_k_reuse_window=5", "-timeline_labels=" + timeline_labels_, "-reuse_distance_labels=" + reuse_distance_labels_, "-reuse_distance_buckets=" + reuse_distance_buckets_, "-reuse_interval_labels=" + reuse_interval_labels_, "-reuse_interval_buckets=" + reuse_interval_buckets_, - }; + "-reuse_lifetime_labels=" + reuse_lifetime_labels_, + "-reuse_lifetime_buckets=" + reuse_lifetime_buckets_, + "-analyze_callers=" + analyzing_callers_, + "-access_count_buckets=" + access_count_buckets_, + "-analyze_get_spatial_locality_labels=" + + analyze_get_spatial_locality_labels_, + "-analyze_get_spatial_locality_buckets=" + + analyze_get_spatial_locality_buckets_}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -189,6 +205,12 @@ class BlockCacheTracerTest : public testing::Test { std::string reuse_distance_buckets_; std::string 
reuse_interval_labels_; std::string reuse_interval_buckets_; + std::string reuse_lifetime_labels_; + std::string reuse_lifetime_buckets_; + std::string analyzing_callers_; + std::string access_count_buckets_; + std::string analyze_get_spatial_locality_labels_; + std::string analyze_get_spatial_locality_buckets_; }; TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { @@ -247,51 +269,65 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } { // Validate the timeline csv files. - const uint32_t expected_num_lines = 50; - std::stringstream ss(timeline_labels_); - while (ss.good()) { - std::string l; - ASSERT_TRUE(getline(ss, l, ',')); - const std::string timeline_file = - test_path_ + "/" + l + "_access_timeline"; - std::ifstream infile(timeline_file); - std::string line; - uint32_t nlines = 0; - ASSERT_TRUE(getline(infile, line)); - uint64_t expected_time = 1; - while (getline(infile, line)) { - std::stringstream ss_naccess(line); - uint32_t naccesses = 0; - std::string substr; - uint32_t time = 0; - while (ss_naccess.good()) { - ASSERT_TRUE(getline(ss_naccess, substr, ',')); - if (time == 0) { - time = ParseUint32(substr); - continue; + const std::vector time_units{"_60", "_3600"}; + const std::vector user_access_only_flags{"user_access_only_", + "all_access_"}; + for (auto const& user_access_only : user_access_only_flags) { + for (auto const& unit : time_units) { + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + if (l.find("block") == std::string::npos) { + if (unit != "_60" || user_access_only != "all_access_") { + continue; + } } - naccesses += ParseUint32(substr); + const std::string timeline_file = test_path_ + "/" + + user_access_only + l + unit + + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + const uint64_t expected_naccesses = 50; + const uint64_t expected_user_accesses = 30; + ASSERT_TRUE(getline(infile, line)) << timeline_file; + uint32_t naccesses = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + naccesses += ParseUint32(substr); + } + } + if (user_access_only == "user_access_only_") { + ASSERT_EQ(expected_user_accesses, naccesses) << timeline_file; + } else { + ASSERT_EQ(expected_naccesses, naccesses) << timeline_file; + } + ASSERT_OK(env_->DeleteFile(timeline_file)); } - nlines++; - ASSERT_EQ(1, naccesses); - ASSERT_EQ(expected_time, time); - expected_time += 1; } - ASSERT_EQ(expected_num_lines, nlines); - ASSERT_OK(env_->DeleteFile(timeline_file)); } } { // Validate the reuse_interval and reuse_distance csv files. 
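// (Each validation below repeats the same row walk; a hypothetical helper,
// not part of the test, makes the pattern explicit:
//
//   double SumCsvRow(const std::string& line) {  // e.g. "label,25.0,75.0"
//     std::stringstream ss(line);
//     std::string cell;
//     std::getline(ss, cell, ',');  // skip the leading label column
//     double sum = 0;
//     while (std::getline(ss, cell, ',')) {
//       sum += ParseDouble(cell);
//     }
//     return sum;  // a complete percentage breakdown should sum to 100
//   }
// )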
std::map test_reuse_csv_files; - test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_access_reuse_interval"] = reuse_interval_labels_; test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_; + test_reuse_csv_files["_reuse_lifetime"] = reuse_lifetime_labels_; + test_reuse_csv_files["_avg_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_avg_reuse_interval_naccesses"] = + reuse_interval_labels_; for (auto const& test : test_reuse_csv_files) { const std::string& file_suffix = test.first; const std::string& labels = test.second; - const uint32_t expected_num_rows = 10; - const uint32_t expected_num_rows_absolute_values = 5; - const uint32_t expected_reused_blocks = 0; + const uint32_t expected_num_rows = 5; std::stringstream ss(labels); while (ss.good()) { std::string l; @@ -300,7 +336,6 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::ifstream infile(reuse_csv_file); std::string line; ASSERT_TRUE(getline(infile, line)); - uint32_t nblocks = 0; double npercentage = 0; uint32_t nrows = 0; while (getline(infile, line)) { @@ -314,20 +349,162 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { label_read = true; continue; } - if (nrows < expected_num_rows_absolute_values) { - nblocks += ParseUint32(substr); - } else { - npercentage += ParseDouble(substr); - } + npercentage += ParseDouble(substr); } } ASSERT_EQ(expected_num_rows, nrows); - ASSERT_EQ(expected_reused_blocks, nblocks); - ASSERT_LT(npercentage, 0); + if ("_reuse_lifetime" == test.first || + "_avg_reuse_interval" == test.first || + "_avg_reuse_interval_naccesses" == test.first) { + ASSERT_EQ(100, npercentage) << reuse_csv_file; + } else { + ASSERT_LT(npercentage, 0); + } ASSERT_OK(env_->DeleteFile(reuse_csv_file)); } } } + + { + // Validate the percentage of accesses summary. + const std::string percent_access_summary_file = + test_path_ + "/percentage_of_accesses_summary"; + std::ifstream infile(percent_access_summary_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + std::set callers; + std::set expected_callers{"Get", "MultiGet", "Iterator", + "Prefetch", "Compaction"}; + while (getline(infile, line)) { + std::stringstream caller_percent(line); + std::string caller; + ASSERT_TRUE(getline(caller_percent, caller, ',')); + std::string percent; + ASSERT_TRUE(getline(caller_percent, percent, ',')); + ASSERT_FALSE(caller_percent.good()); + callers.insert(caller); + ASSERT_EQ(20, ParseDouble(percent)); + } + ASSERT_EQ(expected_callers.size(), callers.size()); + for (auto caller : callers) { + ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end()); + } + ASSERT_OK(env_->DeleteFile(percent_access_summary_file)); + } + { + // Validate the percentage of accesses summary by analyzing callers. 
+ std::stringstream analyzing_callers(analyzing_callers_); + while (analyzing_callers.good()) { + std::string caller; + ASSERT_TRUE(getline(analyzing_callers, caller, ',')); + std::vector breakdowns{"level", "bt"}; + for (auto breakdown : breakdowns) { + const std::string file_name = test_path_ + "/" + caller + "_" + + breakdown + + "_percentage_of_accesses_summary"; + std::ifstream infile(file_name); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum = 0; + while (getline(infile, line)) { + std::stringstream label_percent(line); + std::string label; + ASSERT_TRUE(getline(label_percent, label, ',')); + std::string percent; + ASSERT_TRUE(getline(label_percent, percent, ',')); + ASSERT_FALSE(label_percent.good()); + sum += ParseDouble(percent); + } + ASSERT_EQ(100, sum); + ASSERT_OK(env_->DeleteFile(file_name)); + } + } + } + const std::vector access_types{"user_access_only", "all_access"}; + const std::vector prefix{"bt", "cf"}; + for (auto const& pre : prefix) { + for (auto const& access_type : access_types) { + { + // Validate the access count summary. + const std::string bt_access_count_summary = test_path_ + "/" + pre + + "_" + access_type + + "_access_count_summary"; + std::ifstream infile(bt_access_count_summary); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(bt_access_count_summary)); + } + } + } + for (auto const& access_type : access_types) { + std::vector block_types{"Index", "Data", "Filter"}; + for (auto block_type : block_types) { + // Validate reuse block timeline. 
+ const std::string reuse_blocks_timeline = test_path_ + "/" + block_type + + "_" + access_type + + "_5_reuse_blocks_timeline"; + std::ifstream infile(reuse_blocks_timeline); + std::string line; + ASSERT_TRUE(getline(infile, line)) << reuse_blocks_timeline; + uint32_t index = 0; + while (getline(infile, line)) { + std::stringstream timeline(line); + bool start_time = false; + double sum = 0; + while (timeline.good()) { + std::string value; + ASSERT_TRUE(getline(timeline, value, ',')); + if (!start_time) { + start_time = true; + continue; + } + sum += ParseDouble(value); + } + index++; + ASSERT_LT(sum, 100.0 * index + 1) << reuse_blocks_timeline; + } + ASSERT_OK(env_->DeleteFile(reuse_blocks_timeline)); + } + } + + std::stringstream ss(analyze_get_spatial_locality_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::vector spatial_locality_files{ + "_percent_ref_keys", "_percent_accesses_on_ref_keys", + "_percent_data_size_on_ref_keys"}; + for (auto const& spatial_locality_file : spatial_locality_files) { + const std::string filename = test_path_ + "/" + l + spatial_locality_file; + std::ifstream infile(filename); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + nrows++; + } + ASSERT_EQ(11, nrows); + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(filename)); + } + } ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); } @@ -366,6 +543,7 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // Read blocks. BlockCacheTraceAnalyzer analyzer(trace_file_path_, /*output_miss_ratio_curve_path=*/"", + /*compute_reuse_distance=*/true, /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. 
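// (Status::Incomplete is the expected terminal state of a fully consumed
// trace: block_cache_trace_analyzer_tool() likewise treats any status
// other than Incomplete as a failure to read the whole trace.)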
 ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index 62db942044c..a74dc4d58cb 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -29,6 +29,8 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) {
 } // namespace

 const uint64_t kMicrosInSecond = 1000 * 1000;
+const uint64_t kSecondInMinute = 60;
+const uint64_t kSecondInHour = 3600;
 const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
 "UnknownColumnFamily";
 const uint64_t BlockCacheTraceHelper::kReservedGetId = 0;
diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h
index 66cbb5adefa..3b26a18d639 100644
--- a/trace_replay/block_cache_tracer.h
+++ b/trace_replay/block_cache_tracer.h
@@ -17,6 +17,9 @@ namespace rocksdb {

 extern const uint64_t kMicrosInSecond;
+extern const uint64_t kSecondInMinute;
+extern const uint64_t kSecondInHour;
+
 class BlockCacheTraceHelper {
  public:

From 61876614dce8c9155e28d40b5d95ec1bf1cbfa47 Mon Sep 17 00:00:00 2001
From: Sergei Petrunia
Date: Fri, 12 Jul 2019 17:26:19 -0700
Subject: [PATCH 218/572] Fix MyRocks compile warnings-treated-as-errors on
 Fedora 30, gcc 9.1.1 (#5553)

Summary:
- Provide assignment operator in CompactionStats
- Provide a copy constructor for FileDescriptor
- Remove std::move from "return std::move(t)" in BoundedQueue

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5553

Differential Revision: D16230170

fbshipit-source-id: fd7c6e52390b2db1be24141e25649cf62424d078
---
 db/internal_stats.h | 22 +++++++++++++++++++
 db/version_edit.h | 2 ++
 .../persistent_cache/persistent_cache_util.h | 2 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/db/internal_stats.h b/db/internal_stats.h
index 20fb07f4853..ebe90d574d6 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -237,6 +237,28 @@ class InternalStats {
 }
 }

+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_written = c.bytes_written;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
 void Clear() {
 this->micros = 0;
 this->cpu_micros = 0;
diff --git a/db/version_edit.h b/db/version_edit.h
index e1857b37fc4..4a93db34e15 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -52,6 +52,8 @@ struct FileDescriptor {
 smallest_seqno(_smallest_seqno),
 largest_seqno(_largest_seqno) {}

+ FileDescriptor(const FileDescriptor& fd) { *this=fd; }
+
 FileDescriptor& operator=(const FileDescriptor& fd) {
 table_reader = fd.table_reader;
 packed_number_and_path_id = fd.packed_number_and_path_id;
diff --git a/utilities/persistent_cache/persistent_cache_util.h b/utilities/persistent_cache/persistent_cache_util.h
index 214bb5875d6..254c038f985 100644
--- a/utilities/persistent_cache/persistent_cache_util.h
+++ b/utilities/persistent_cache/persistent_cache_util.h
@@ -48,7 +48,7 @@ class BoundedQueue {
 T t = std::move(q_.front());
 size_ -= t.Size();
 q_.pop_front();
- return std::move(t);
+ return t;
 }

 size_t Size() const {
From 68d43b4d303d76836e0f2a4600de5de5e98fefea Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Fri, 12 Jul 2019 18:52:48 -0700
Subject: [PATCH 219/572] A python script to plot graphs for csv files
 generated by block_cache_trace_analyzer

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5563

Test Plan: Manually run the script on files generated by block_cache_trace_analyzer.

Differential Revision: D16214400

Pulled By: HaoyuHuang

fbshipit-source-id: 94485eed995e9b2b63e197c5dfeb80129fa7897f
---
 tools/block_cache_trace_analyzer_plot.py | 403 +++++++++++++++++++++++
 1 file changed, 403 insertions(+)
 create mode 100644 tools/block_cache_trace_analyzer_plot.py

diff --git a/tools/block_cache_trace_analyzer_plot.py b/tools/block_cache_trace_analyzer_plot.py
new file mode 100644
index 00000000000..22d56b932c5
--- /dev/null
+++ b/tools/block_cache_trace_analyzer_plot.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+import csv
+import os
+import random
+import sys
+
+import matplotlib.backends.backend_pdf
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+# Make sure a legend has the same color across all generated graphs.
+def get_cmap(n, name="hsv"):
+ """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
+ RGB color; the keyword argument name must be a standard mpl colormap name."""
+ return plt.cm.get_cmap(name, n)
+
+
+color_index = 0
+bar_color_maps = {}
+colors = []
+n_colors = 60
+linear_colors = get_cmap(n_colors)
+for i in range(n_colors):
+ colors.append(linear_colors(i))
+# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
+random.shuffle(colors)
+
+
+def num_to_gb(n):
+ one_gb = 1024 * 1024 * 1024
+ if float(n) % one_gb == 0:
+ return "{}".format(n / one_gb)
+ # Keep two decimal points.
+ return "{0:.2f}".format(float(n) / one_gb)
+
+
+def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
+ mrc_file_path = csv_result_dir + "/mrc"
+ if not os.path.exists(mrc_file_path):
+ return
+ miss_ratios = {}
+ print("Processing file {}".format(mrc_file_path))
+ with open(mrc_file_path, "r") as csvfile:
+ rows = csv.reader(csvfile, delimiter=",")
+ is_header = False
+ for row in rows:
+ if not is_header:
+ is_header = True
+ continue
+ cache_name = row[0]
+ num_shard_bits = int(row[1])
+ ghost_capacity = int(row[2])
+ capacity = int(row[3])
+ miss_ratio = float(row[4])
+ config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+ if config not in miss_ratios:
+ miss_ratios[config] = {}
+ miss_ratios[config]["x"] = []
+ miss_ratios[config]["y"] = []
+ miss_ratios[config]["x"].append(num_to_gb(capacity))
+ miss_ratios[config]["y"].append(miss_ratio)
+ fig = plt.figure()
+ for config in miss_ratios:
+ plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config)
+ plt.xlabel("Cache capacity (GB)")
+ plt.ylabel("Miss Ratio (%)")
+ # plt.xscale('log', basex=2)
+ plt.ylim(ymin=0)
+ plt.title("RocksDB block cache miss ratios")
+ plt.legend()
+ fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight")
+
+
+def sanitize(label):
+ # matplotlib cannot plot legends that are prefixed with "_"
+ # so we need to remove them here.
+ index = 0
+ for i in range(len(label)):
+ if label[i] == "_":
+ index += 1
+ else:
+ break
+ data = label[index:]
+ # The value of uint64_max in c++.
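+ # (i.e. std::numeric_limits<uint64_t>::max()); render it as "max" so
+ # axis labels and chart legends stay readable.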
+ if "18446744073709551615" in data: + return "max" + return data + + +# Read the csv file vertically, i.e., group the data by columns. +def read_data_for_plot_vertical(csvfile): + x = [] + labels = [] + label_stats = {} + csv_rows = csv.reader(csvfile, delimiter=",") + data_rows = [] + for row in csv_rows: + data_rows.append(row) + # header + for i in range(1, len(data_rows[0])): + labels.append(sanitize(data_rows[0][i])) + label_stats[i - 1] = [] + for i in range(1, len(data_rows)): + for j in range(len(data_rows[i])): + if j == 0: + x.append(sanitize(data_rows[i][j])) + continue + label_stats[j - 1].append(float(data_rows[i][j])) + return x, labels, label_stats + + +# Read the csv file horizontally, i.e., group the data by rows. +def read_data_for_plot_horizontal(csvfile): + x = [] + labels = [] + label_stats = {} + csv_rows = csv.reader(csvfile, delimiter=",") + data_rows = [] + for row in csv_rows: + data_rows.append(row) + # header + for i in range(1, len(data_rows)): + labels.append(sanitize(data_rows[i][0])) + label_stats[i - 1] = [] + for i in range(1, len(data_rows[0])): + x.append(sanitize(data_rows[0][i])) + for i in range(1, len(data_rows)): + for j in range(len(data_rows[i])): + if j == 0: + # label + continue + label_stats[i - 1].append(float(data_rows[i][j])) + return x, labels, label_stats + + +def read_data_for_plot(csvfile, vertical): + if vertical: + return read_data_for_plot_vertical(csvfile) + return read_data_for_plot_horizontal(csvfile) + + +def plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + legend, +): + pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + print("Processing file {}".format(file)) + with open(csv_result_dir + "/" + file, "r") as csvfile: + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(labels) == 0: + continue + # plot figure + fig = plt.figure() + for label_index in label_stats: + plt.plot( + [int(x[i]) for i in range(len(x))], + label_stats[label_index], + label=labels[label_index], + ) + + # Translate time unit into x labels. + if "_60" in file: + plt.xlabel("{} (Minute)".format(xlabel)) + if "_3600" in file: + plt.xlabel("{} (Hour)".format(xlabel)) + plt.ylabel(ylabel) + plt.title("{} {}".format(title, file)) + if legend: + plt.legend() + pdf.savefig(fig) + pdf.close() + + +def plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + x_prefix, +): + global color_index, bar_color_maps, colors + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + with open(csv_result_dir + "/" + file, "r") as csvfile: + print("Processing file {}/{}".format(csv_result_dir, file)) + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(label_stats) == 0: + continue + # Plot figure + fig = plt.figure() + ind = np.arange(len(x)) # the x locations for the groups + width = 0.5 # the width of the bars: can also be len(x) sequence + bars = [] + bottom_bars = [] + for _i in label_stats[0]: + bottom_bars.append(0) + for i in range(0, len(label_stats)): + # Assign a unique color to this label. 
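+ # bar_color_maps persists across files, so a label that appears in
+ # several charts is drawn with the same color in each of them.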
+ if labels[i] not in bar_color_maps: + bar_color_maps[labels[i]] = colors[color_index] + color_index += 1 + p = plt.bar( + ind, + label_stats[i], + width, + bottom=bottom_bars, + color=bar_color_maps[labels[i]], + ) + bars.append(p[0]) + for j in range(len(label_stats[i])): + bottom_bars[j] += label_stats[i][j] + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.xticks( + ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8 + ) + plt.legend(bars, labels) + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_access_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_timeline", + pdf_name="access_time.pdf", + xlabel="Time", + ylabel="Throughput", + title="Access timeline with group by label", + vertical=False, + legend=True, + ) + + +def plot_reuse_graphs(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval_naccesses", + pdf_name="avg_reuse_interval_naccesses.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval", + pdf_name="avg_reuse_interval.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_reuse_interval", + pdf_name="reuse_interval.pdf", + xlabel="Seconds", + ylabel="Percentage of accesses", + title="Reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="reuse_lifetime", + pdf_name="reuse_lifetime.pdf", + xlabel="Seconds", + ylabel="Percentage of blocks", + title="Reuse lifetime", + vertical=True, + x_prefix="< ", + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix="reuse_blocks_timeline", + pdf_name="reuse_blocks_timeline.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Reuse blocks timeline", + vertical=False, + legend=False, + ) + + +def plot_percentage_access_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percentage_of_accesses_summary", + pdf_name="percentage_access.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_ref_keys", + pdf_name="percent_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_data_size_on_ref_keys", + pdf_name="percent_data_size_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_accesses_on_ref_keys", + pdf_name="percent_accesses_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + + +def plot_access_count_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_count_summary", + pdf_name="access_count_summary.pdf", + 
xlabel="Access count",
+ ylabel="Percentage of blocks",
+ title="",
+ vertical=True,
+ x_prefix="< ",
+ )
+
+
+if __name__ == "__main__":
+ if len(sys.argv) < 3:
+ print(
+ "Must provide two arguments: 1) The directory that saves a list of "
+ "directories which contain block cache trace analyzer result files "
+ "2) the directory to save plotted graphs."
+ )
+ exit(1)
+ csv_result_dir = sys.argv[1]
+ output_result_dir = sys.argv[2]
+ print(
+ "Processing directory {} and save graphs to {}.".format(
+ csv_result_dir, output_result_dir
+ )
+ )
+ for csv_relative_dir in os.listdir(csv_result_dir):
+ csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
+ result_dir = output_result_dir + "/" + csv_relative_dir
+ if not os.path.isdir(csv_abs_dir):
+ print("{} is not a directory".format(csv_abs_dir))
+ continue
+ print("Processing experiment dir: {}".format(csv_relative_dir))
+ if not os.path.exists(result_dir):
+ os.makedirs(result_dir)
+ plot_miss_ratio_graphs(csv_abs_dir, result_dir)
+ plot_access_timeline(csv_abs_dir, result_dir)
+ plot_reuse_graphs(csv_abs_dir, result_dir)
+ plot_percentage_access_summary(csv_abs_dir, result_dir)
+ plot_access_count_summary(csv_abs_dir, result_dir)
From f064d74e4549964566e1f9a5bf988bf94acbd5e1 Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Mon, 15 Jul 2019 11:16:55 -0700
Subject: [PATCH 220/572] Cleanup the Arm64 CRC32 unused warning (#5565)

Summary:
When 'HAVE_ARM64_CRC' is set, the methods below:
- bool rocksdb::crc32c::isSSE42()
- bool rocksdb::crc32c::isPCLMULQDQ()
are defined but not used, so an unused-function warning is raised when building rocksdb.

This patch cleans up these warnings by adding an ifndef: when building under HAVE_ARM64_CRC, we do not define `isSSE42` and `isPCLMULQDQ`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5565

Differential Revision: D16233654

fbshipit-source-id: c32a9dda7465dbf65f9ccafef159124db92cdffd
---
 util/crc32c.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/util/crc32c.cc b/util/crc32c.cc
index e8d4116ff42..9e838b830f5 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -398,6 +398,8 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 return static_cast<uint32_t>(l ^ 0xffffffffu);
 }

+// Detect if ARM64 CRC or not.
+#ifndef HAVE_ARM64_CRC
 // Detect if SS42 or not.
#ifndef HAVE_POWER8 @@ -436,6 +438,7 @@ static bool isPCLMULQDQ() { } #endif // HAVE_POWER8 +#endif // HAVE_ARM64_CRC typedef uint32_t (*Function)(uint32_t, const char*, size_t); From b0259e45e0be576f98e31020975a8b1cef8fb31f Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 15 Jul 2019 11:39:18 -0700 Subject: [PATCH 221/572] add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5 --- db/db_impl/db_impl.cc | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index cf8dddb7fe1..6f2ebdc8098 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -14,6 +14,7 @@ #endif #include +#include #include #include #include @@ -697,10 +698,15 @@ void DBImpl::PersistStats() { if (!statistics->getTickerMap(&stats_map)) { return; } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- PERSISTING STATS -------"); if (immutable_db_options_.persist_stats_to_disk) { WriteBatch batch; if (stats_slice_initialized_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reading %" ROCKSDB_PRIszt " stats from statistics\n", + stats_slice_.size()); for (const auto& stat : stats_map) { char key[100]; int length = @@ -722,8 +728,13 @@ void DBImpl::PersistStats() { Status s = Write(wo, &batch); if (!s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Writing to persistent stats CF failed -- %s\n", + "Writing to persistent stats CF failed -- %s", s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); } // TODO(Zhongyi): add purging for persisted data } else { @@ -736,6 +747,10 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" 
ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
 stats_history_[now_seconds] = stats_delta;
 }
 stats_slice_initialized_ = true;
@@ -743,15 +758,22 @@
 TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");

 // delete older stats snapshots to control memory consumption
- bool purge_needed =
- EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
 while (purge_needed && !stats_history_.empty()) {
 stats_history_.erase(stats_history_.begin());
 purge_needed =
 EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
 }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
 }
- // TODO: persist stats to disk
 #endif // !ROCKSDB_LITE
 }
From 6e8a1354a799f14fb068fdecd771daa64918d36d Mon Sep 17 00:00:00 2001
From: Tomas Kolda
Date: Mon, 15 Jul 2019 12:15:21 -0700
Subject: [PATCH 222/572] Fix regression - 100% CPU - Regression for Windows 7
 (#5557)

Summary:
Fixes https://github.com/facebook/rocksdb/issues/5552

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5557

Differential Revision: D16266329

fbshipit-source-id: a8f6b50298a6f7c8d6c7e172bb26dd7eb6bd8a4d
---
 port/win/env_win.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/port/win/env_win.cc b/port/win/env_win.cc
index 9abb14d67ea..7718ebd72c5 100644
--- a/port/win/env_win.cc
+++ b/port/win/env_win.cc
@@ -979,8 +979,7 @@ uint64_t WinEnvIO::NowMicros() {
 return li.QuadPart;
 }
 using namespace std::chrono;
- return duration_cast<microseconds>(
- high_resolution_clock::now().time_since_epoch()).count();
+ return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
 }

 uint64_t WinEnvIO::NowNanos() {
From cd2520361d38ef3556d3bda479fd7a4caa0d1168 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Mon, 15 Jul 2019 12:55:37 -0700
Subject: [PATCH 223/572] Fix memory leak in `rocksdb_wal_iter_get_batch` function (#5515)

Summary:
`wal_batch.writeBatchPtr.release()` gives up the ownership of the original `WriteBatch`, but there is no new owner, which causes a memory leak.

The patch is simple. Removing `release()` prevents the ownership change. `std::move` is for speed.
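As a minimal standalone illustration of the ownership difference (using a stand-in `WriteBatch` type, not the actual RocksDB class):

```
#include <memory>
#include <utility>

struct WriteBatch {};  // stand-in for rocksdb::WriteBatch

int main() {
  std::unique_ptr<WriteBatch> write_batch_ptr(new WriteBatch);

  // Leaky pattern: release() detaches the object from the unique_ptr, and
  // after copying from the dereferenced pointer nothing owns or deletes it:
  //   WriteBatch copy = *write_batch_ptr.release();
  //
  // Fixed pattern: move out of the still-owned object. The unique_ptr keeps
  // ownership of the (moved-from) WriteBatch and frees it on destruction.
  WriteBatch moved = std::move(*write_batch_ptr);
  (void)moved;
  return 0;  // write_batch_ptr is destroyed here; nothing leaks
}
```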
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5515 Differential Revision: D16264281 Pulled By: riversand963 fbshipit-source-id: 51c556b7a1c977325c3aa24acb636303847151fa --- db/c.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/c.cc b/db/c.cc index 17dc766dd66..4d40558f6b1 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1034,7 +1034,7 @@ void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) { rocksdb_writebatch_t* result = rocksdb_writebatch_create(); BatchResult wal_batch = iter->rep->GetBatch(); - result->rep = * wal_batch.writeBatchPtr.release(); + result->rep = std::move(*wal_batch.writeBatchPtr); if (seq != nullptr) { *seq = wal_batch.sequence; } From 3bde41b5a3f71a67cfee67d2a26244b80c777148 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 16 Jul 2019 13:11:23 -0700 Subject: [PATCH 224/572] Move the filter readers out of the block cache (#5504) Summary: Currently, when the block cache is used for the filter block, it is not really the block itself that is stored in the cache but a FilterBlockReader object. Since this object is not pure data (it has, for instance, pointers that might dangle, including in one case a back pointer to the TableReader), it's not really sharable. To avoid the issues around this, the current code erases the cache entries when the TableReader is closed (which, BTW, is not sufficient since a concurrent TableReader might have picked up the object in the meantime). Instead of doing this, the patch moves the FilterBlockReader out of the cache altogether, and decouples the filter reader object from the filter block. In particular, instead of the TableReader owning, or caching/pinning the FilterBlockReader (based on the customer's settings), with the change the TableReader unconditionally owns the FilterBlockReader, which in turn owns/caches/pins the filter block. This change also enables us to reuse the code paths historically used for data blocks for filters as well. Note: Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a separate phase. 
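In outline, the new ownership chain looks roughly like this (a simplified sketch with stand-in types; the real classes live in table/block_based/ and are considerably more involved):

```
#include <memory>
#include <utility>

struct BlockContents {};  // pure filter data; safe to cache and share

// Stand-in for CachableEntry<T>: owns (or pins in the cache) one value.
template <typename T>
class CachableEntry {
 public:
  explicit CachableEntry(std::unique_ptr<T> value) : value_(std::move(value)) {}
  const T* GetValue() const { return value_.get(); }

 private:
  std::unique_ptr<T> value_;
};

// The reader itself is no longer a cache entry; it owns/caches/pins the
// filter block instead.
class FilterBlockReader {
 public:
  explicit FilterBlockReader(CachableEntry<BlockContents>&& block)
      : filter_block_(std::move(block)) {}

 private:
  CachableEntry<BlockContents> filter_block_;
};

// The table reader unconditionally owns the filter reader.
class TableReader {
 public:
  explicit TableReader(std::unique_ptr<FilterBlockReader> filter)
      : filter_(std::move(filter)) {}

 private:
  std::unique_ptr<FilterBlockReader> filter_;
};

int main() {
  CachableEntry<BlockContents> block(
      std::unique_ptr<BlockContents>(new BlockContents));
  TableReader table(std::unique_ptr<FilterBlockReader>(
      new FilterBlockReader(std::move(block))));
  (void)table;
  return 0;
}
```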
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504 Test Plan: make asan_check Differential Revision: D16036974 Pulled By: ltamasi fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091 --- CMakeLists.txt | 1 + HISTORY.md | 3 +- TARGETS | 1 + db/db_block_cache_test.cc | 14 +- src.mk | 1 + table/block_based/block_based_filter_block.cc | 178 +++-- table/block_based/block_based_filter_block.h | 45 +- .../block_based_filter_block_test.cc | 322 ++++++--- table/block_based/block_based_table_reader.cc | 626 +++++++----------- table/block_based/block_based_table_reader.h | 69 +- table/block_based/cachable_entry.h | 1 + table/block_based/filter_block.h | 66 +- .../block_based/filter_block_reader_common.cc | 90 +++ .../block_based/filter_block_reader_common.h | 54 ++ table/block_based/full_filter_block.cc | 156 +++-- table/block_based/full_filter_block.h | 62 +- table/block_based/full_filter_block_test.cc | 204 ++++-- table/block_based/partitioned_filter_block.cc | 303 +++++---- table/block_based/partitioned_filter_block.h | 68 +- .../partitioned_filter_block_test.cc | 118 ++-- table/table_reader.h | 3 +- table/table_test.cc | 6 +- tools/sst_dump_tool.cc | 3 +- 23 files changed, 1399 insertions(+), 995 deletions(-) create mode 100644 table/block_based/filter_block_reader_common.cc create mode 100644 table/block_based/filter_block_reader_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c47f9811ef2..65904b8cae6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -595,6 +595,7 @@ set(SOURCES table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc + table/block_based/filter_block_reader_common.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc table/block_based/index_builder.cc diff --git a/HISTORY.md b/HISTORY.md index 099c9f37e86..2e1e03f68de 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,9 @@ ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Index and filter blocks are now handled similarly to data blocks with regards to the block cache: instead of storing reader objects in the cache, only the blocks themselves are cached. In addition, index and filter blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any). * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes and filters are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. 
diff --git a/TARGETS b/TARGETS index 6ef3da179dc..eda1051396d 100644 --- a/TARGETS +++ b/TARGETS @@ -192,6 +192,7 @@ cpp_library( "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 8eb73a23dd7..77f37da0d45 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -365,11 +365,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -380,13 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. 
 // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
 // index_bytes_insert);
- ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
- filter_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
 }

 namespace {
diff --git a/src.mk b/src.mk
index bc49b7ce074..fe930d5f49b 100644
--- a/src.mk
+++ b/src.mk
@@ -115,6 +115,7 @@ LIB_SOURCES = \
 table/block_based/block_prefix_index.cc \
 table/block_based/data_block_hash_index.cc \
 table/block_based/data_block_footer.cc \
+ table/block_based/filter_block_reader_common.cc \
 table/block_based/flush_block_policy.cc \
 table/block_based/full_filter_block.cc \
 table/block_based/index_builder.cc \
diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc
index e5a32e4635f..5585b8441c5 100644
--- a/table/block_based/block_based_filter_block.cc
+++ b/table/block_based/block_based_filter_block.cc
@@ -13,6 +13,7 @@
 #include "db/dbformat.h"
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "util/coding.h"
 #include "util/string_util.h"
@@ -162,58 +163,120 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() {
 }

 BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
- const SliceTransform* prefix_extractor,
- const BlockBasedTableOptions& table_opt, bool _whole_key_filtering,
- BlockContents&& contents, Statistics* stats)
- : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
- policy_(table_opt.filter_policy.get()),
- prefix_extractor_(prefix_extractor),
- data_(nullptr),
- offset_(nullptr),
- num_(0),
- base_lg_(0),
- contents_(std::move(contents)) {
- assert(policy_);
- size_t n = contents_.data.size();
- if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array
- base_lg_ = contents_.data[n - 1];
- uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
- if (last_word > n - 5) return;
- data_ = contents_.data.data();
- offset_ = data_ + last_word;
- num_ = (n - 5 - last_word) / 4;
+ const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block)
+ : FilterBlockReaderCommon(t, std::move(filter_block)) {
+ assert(table());
+ assert(table()->get_rep());
+ assert(table()->get_rep()->filter_policy);
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+
+ CachableEntry<BlockContents> filter_block;
+ if (prefetch || !use_cache) {
+ const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(),
+ nullptr /* get_context */, lookup_context,
+ &filter_block);
+ if (!s.ok()) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ if (use_cache && !pin) {
+ filter_block.Reset();
+ }
+ }
+
+ return std::unique_ptr<FilterBlockReader>(
+ new BlockBasedFilterBlockReader(table, std::move(filter_block)));
 }

 bool BlockBasedFilterBlockReader::KeyMayMatch(
 const Slice& key, const SliceTransform* /* prefix_extractor */,
- uint64_t block_offset, const bool /*no_io*/,
- const Slice* const /*const_ikey_ptr*/,
- BlockCacheLookupContext* /*context*/) {
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
 assert(block_offset != kNotValid);
- if (!whole_key_filtering_) {
+ if (!whole_key_filtering()) {
 return true;
 }
- return MayMatch(key, block_offset);
+ return MayMatch(key, block_offset, no_io, get_context, lookup_context);
 }

 bool BlockBasedFilterBlockReader::PrefixMayMatch(
 const Slice& prefix, const SliceTransform* /* prefix_extractor */,
- uint64_t block_offset, const bool /*no_io*/,
- const Slice* const /*const_ikey_ptr*/,
- BlockCacheLookupContext* /*context*/) {
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
 assert(block_offset != kNotValid);
- return MayMatch(prefix, block_offset);
+ return MayMatch(prefix, block_offset, no_io, get_context, lookup_context);
+}
+
+bool BlockBasedFilterBlockReader::ParseFieldsFromBlock(
+ const BlockContents& contents, const char** data, const char** offset,
+ size_t* num, size_t* base_lg) {
+ assert(data);
+ assert(offset);
+ assert(num);
+ assert(base_lg);
+
+ const size_t n = contents.data.size();
+ if (n < 5) { // 1 byte for base_lg and 4 for start of offset array
+ return false;
+ }
+
+ const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5);
+ if (last_word > n - 5) {
+ return false;
+ }
+
+ *data = contents.data.data();
+ *offset = (*data) + last_word;
+ *num = (n - 5 - last_word) / 4;
+ *base_lg = contents.data[n - 1];
+
+ return true;
 }

-bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
- uint64_t block_offset) {
- uint64_t index = block_offset >> base_lg_;
- if (index < num_) {
- uint32_t start = DecodeFixed32(offset_ + index * 4);
- uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
- if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
- Slice filter = Slice(data_ + start, limit - start);
- bool const may_match = policy_->KeyMayMatch(entry, filter);
+bool BlockBasedFilterBlockReader::MayMatch(
+ const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
+ if (!s.ok()) {
+ return true;
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return true; // Errors are treated as potential matches
+ }
+
+ const uint64_t index = block_offset >> base_lg;
+ if (index < num) {
+ const uint32_t start = DecodeFixed32(offset + index * 4);
+ const uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
+ if (start <= limit && limit <= (uint32_t)(offset - data)) {
+ const Slice filter = Slice(data + start, limit - start);
+
+ assert(table());
+ assert(table()->get_rep());
+ const FilterPolicy* const policy = table()->get_rep()->filter_policy;
+
+ const bool may_match = policy->KeyMayMatch(entry, filter);
 if (may_match) {
 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
 return true;
@@ -230,27 +293,54 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
 }

 size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
- return num_ * 4 + 5 + (offset_ - data_);
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
 }

 std::string BlockBasedFilterBlockReader::ToString() const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(false /* no_io */,
nullptr /* get_context */,
+ nullptr /* lookup_context */, &filter_block);
+ if (!s.ok()) {
+ return std::string("Unable to retrieve filter block");
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return std::string("Error parsing filter block");
+ }
+
 std::string result;
 result.reserve(1024);
 std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
- AppendItem(&result, s_fb, rocksdb::ToString(num_));
+ AppendItem(&result, s_fb, rocksdb::ToString(num));
 AppendItem(&result, s_bo, s_hd);

- for (size_t index = 0; index < num_; index++) {
- uint32_t start = DecodeFixed32(offset_ + index * 4);
- uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
+ for (size_t index = 0; index < num; index++) {
+ uint32_t start = DecodeFixed32(offset + index * 4);
+ uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
 if (start != limit) {
 result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n");
- Slice filter = Slice(data_ + start, limit - start);
+ Slice filter = Slice(data + start, limit - start);
 AppendItem(&result, start, filter.ToString(true));
 }
 }
 return result;
 }
+
 } // namespace rocksdb
diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h
index cd86ff5c8a5..43dbc4f4f9f 100644
--- a/table/block_based/block_based_filter_block.h
+++ b/table/block_based/block_based_filter_block.h
@@ -22,7 +22,8 @@
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/format.h"
 #include "util/hash.h"

 namespace rocksdb {
@@ -75,42 +76,42 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {

 // A FilterBlockReader is used to parse filter from SST table.
 // KeyMayMatch and PrefixMayMatch would trigger filter checking
-class BlockBasedFilterBlockReader : public FilterBlockReader {
+class BlockBasedFilterBlockReader
+ : public FilterBlockReaderCommon<BlockContents> {
 public:
- // REQUIRES: "contents" and *policy must stay live while *this is live.
- BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor,
- const BlockBasedTableOptions& table_opt,
- bool whole_key_filtering,
- BlockContents&& contents, Statistics* statistics);
+ BlockBasedFilterBlockReader(const BlockBasedTable* t,
+ CachableEntry<BlockContents>&& filter_block);
+
+ static std::unique_ptr<FilterBlockReader> Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context);
+
 bool IsBlockBased() override { return true; }

 bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor,
 uint64_t block_offset, const bool no_io,
- const Slice* const const_ikey_ptr,
- BlockCacheLookupContext* context) override;
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
 bool PrefixMayMatch(const Slice& prefix,
 const SliceTransform* prefix_extractor,
 uint64_t block_offset, const bool no_io,
 const Slice* const const_ikey_ptr,
- BlockCacheLookupContext* context) override;
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
 size_t ApproximateMemoryUsage() const override;
 // convert this object to a human readable form
 std::string ToString() const override;

 private:
- const FilterPolicy* policy_;
- const SliceTransform* prefix_extractor_;
- const char* data_; // Pointer to filter data (at block-start)
- const char* offset_; // Pointer to beginning of offset array (at block-end)
- size_t num_; // Number of entries in offset array
- size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
- BlockContents contents_;
+ static bool ParseFieldsFromBlock(const BlockContents& contents,
+ const char** data, const char** offset,
+ size_t* num, size_t* base_lg);

- bool MayMatch(const Slice& entry, uint64_t block_offset);
-
- // No copying allowed
- BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&);
- void operator=(const BlockBasedFilterBlockReader&);
+ bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
 };
+
 } // namespace rocksdb
diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc
index 220888dd2fb..70bbde96ac8 100644
--- a/table/block_based/block_based_filter_block_test.cc
+++ b/table/block_based/block_based_filter_block_test.cc
@@ -10,6 +10,7 @@
 #include "table/block_based/block_based_filter_block.h"

 #include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/coding.h"
@@ -41,28 +42,58 @@ class TestHashFilter : public FilterPolicy {
 }
 };

+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {
+ // Initialize what Open normally does as much as necessary for the test
+ rep->cache_key_prefix_size = 10;
+ }
+};
+
 class FilterBlockTest : public testing::Test {
 public:
- TestHashFilter policy_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
 BlockBasedTableOptions table_options_;
-
- FilterBlockTest() {
- table_options_.filter_policy.reset(new TestHashFilter());
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+
+ FilterBlockTest()
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator) {
+ table_options_.no_block_cache = true;
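+ // With the block cache disabled, the tests below hand the reader a
+ // CachableEntry that directly owns its BlockContents (own_value=true).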
+ table_options_.filter_policy.reset(new TestHashFilter);
+
+ constexpr bool skip_filters = false;
+ constexpr int level = 0;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockBlockBasedTable(
+ new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
+ icomp_, skip_filters, level, immortal_table)));
 }
 };

 TEST_F(FilterBlockTest, EmptyBuilder) {
 BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
- BlockContents block(builder.Finish());
- ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
+ Slice slice(builder.Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
 ASSERT_TRUE(reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 TEST_F(FilterBlockTest, SingleChunk) {
@@ -77,30 +108,46 @@ TEST_F(FilterBlockTest, SingleChunk) {
 builder.StartBlock(300);
 builder.Add("hello");
 ASSERT_EQ(5, builder.NumAdded());
- BlockContents block(builder.Finish());
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 TEST_F(FilterBlockTest, MultiChunk) {
@@ -123,93 +170,139 @@ TEST_F(FilterBlockTest, MultiChunk) {
 builder.Add("box");
 builder.Add("hello");

- BlockContents block(builder.Finish());
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));

 // Check first filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/2000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check second filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
/*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check third filter (empty)
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check last filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 // Test for block based filter block
 // use new interface in FilterPolicy to create filter builder/reader
 class BlockBasedFilterBlockTest : public testing::Test {
 public:
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
 BlockBasedTableOptions table_options_;
-
- BlockBasedFilterBlockTest() {
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+
+ BlockBasedFilterBlockTest()
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator) {
+ table_options_.no_block_cache = true;
 table_options_.filter_policy.reset(NewBloomFilterPolicy(10));
- }

- ~BlockBasedFilterBlockTest() override {}
+ constexpr bool skip_filters = false;
+ constexpr int level = 0;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockBlockBasedTable(
+ new BlockBasedTable::Rep(ioptions_,
env_options_, table_options_,
+ icomp_, skip_filters, level, immortal_table)));
+ }
 };

 TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
 FilterBlockBuilder* builder =
 new BlockBasedFilterBlockBuilder(nullptr, table_options_);
- BlockContents block(builder->Finish());
- ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
- FilterBlockReader* reader = new BlockBasedFilterBlockReader(
- nullptr, table_options_, true, std::move(block), nullptr);
+ Slice slice(builder->Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 delete builder;
 delete reader;
@@ -226,30 +319,42 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
 builder->Add("box");
 builder->StartBlock(300);
 builder->Add("hello");
- BlockContents block(builder->Finish());
- FilterBlockReader* reader = new BlockBasedFilterBlockReader(
- nullptr, table_options_, true, std::move(block), nullptr);
+ Slice slice(builder->Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader->KeyMayMatch(
 "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
ASSERT_TRUE(!reader->KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; @@ -276,65 +381,86 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); // Check first filter ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check second filter ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check third filter (empty) ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check last filter ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 26c1365c4e7..a888603d72b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -85,6 +85,8 @@ Status ReadBlockFromFile( const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + BlockContents contents; BlockFetcher block_fetcher( file, prefetch_buffer, footer, options, handle, &contents, ioptions, @@ -99,6 +101,32 @@ Status ReadBlockFromFile( return s; } +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + SequenceNumber /* global_seqno */, size_t /* read_amp_bytes_per_bit */, + MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + + result->reset(new BlockContents); + + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, result->get(), ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + + const Status s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + result->reset(); + } + + return s; +} + inline MemoryAllocator* GetMemoryAllocator( const BlockBasedTableOptions& table_options) { return table_options.block_cache.get() @@ -120,7 +148,6 @@ void DeleteCachedEntry(const Slice& /*key*/, 
void* value) { delete entry; } -void DeleteCachedFilterEntry(const Slice& key, void* value); void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); // Release the cached entry and decrement its ref count. @@ -283,8 +310,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -304,7 +332,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new PartitionIndexReader(table, std::move(index_block)); + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -445,7 +474,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -472,8 +501,9 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -493,7 +523,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new BinarySearchIndexReader(table, std::move(index_block)); + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -532,7 +563,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -552,8 +583,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(index_reader != nullptr); assert(!pin || prefetch); @@ -579,8 +611,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
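The comment above ("Create will succeed regardless") marks a deliberate design choice: failure to build the auxiliary prefix index only degrades the reader to plain binary search, it never fails the open. A minimal sketch of that graceful-degradation shape follows; the types and names are invented, and only the control flow mirrors the patch.

#include <memory>

struct Status {
  static Status OK() { return Status(); }
  bool ok() const { return true; }
};

struct PrefixIndex {};

struct Reader {
  std::unique_ptr<PrefixIndex> prefix_index_;  // optional accelerator
};

Status CreateReader(std::unique_ptr<Reader>* out) {
  // The primary structure is built first and is already usable.
  out->reset(new Reader);

  // Best effort from here on: try to attach the optional prefix index,
  // but swallow any failure instead of propagating it.
  Status s = Status::OK();  // stands in for reading the prefixes blocks
  if (s.ok()) {
    (*out)->prefix_index_.reset(new PrefixIndex);
  }
  return Status::OK();  // succeed regardless, as the comment above says
}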
- auto new_index_reader = new HashIndexReader(table, std::move(index_block)); - *index_reader = new_index_reader; + index_reader->reset(new HashIndexReader(table, std::move(index_block))); // Get prefixes block BlockHandle prefixes_handle; @@ -636,7 +667,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { prefixes_meta_contents.data, &prefix_index); // TODO: log error if (s.ok()) { - new_index_reader->prefix_index_.reset(prefix_index); + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); } return Status::OK(); @@ -679,7 +712,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else if (prefix_index_) { usage += prefix_index_->ApproximateMemoryUsage(); @@ -1453,22 +1486,49 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - { - // Find compression dictionary handle - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep_->compression_dict_handle); + // Find compression dictionary handle + bool found_compression_dict = false; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; } BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; + // pin both index and filters, down to all partitions + const bool pin_all = + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + // prefetch the first level of index const bool prefetch_index = prefetch_all || (table_options.pin_top_level_index_and_filter && index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + std::unique_ptr index_reader; + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of partitioned index are always stored in cache. 
They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep_->index_reader->CacheDependencies(pin_all); + } + // prefetch the first level of filter const bool prefetch_filter = prefetch_all || @@ -1476,83 +1536,36 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->filter_type == Rep::FilterType::kPartitionedFilter); // Partition filters cannot be enabled without partition indexes assert(!prefetch_filter || prefetch_index); - // pin both index and filters, down to all partitions - const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; - // pin the first level of index - const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of filter const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && rep_->filter_type == Rep::FilterType::kPartitionedFilter); - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, - prefetch_index, pin_index, &index_reader, - lookup_context); - if (s.ok()) { - assert(index_reader != nullptr); - rep_->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + if (filter) { + // Refer to the comment above about partitioned indexes always being cached if (prefetch_all) { - rep_->index_reader->CacheDependencies(pin_all); + filter->CacheDependencies(pin_all); + } - } else { - delete index_reader; - index_reader = nullptr; + + rep_->filter = std::move(filter); + } } - // pre-fetching of blocks is turned on - // Will use block cache for meta-blocks access - // Always prefetch index and filter for level 0 // TODO(ajkr): also prefetch compression dictionary block // TODO(ajkr): also pin compression dictionary block when // `pin_l0_filter_and_index_blocks_in_cache == true`.
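To keep the consolidated pin/prefetch rules in one place for the reader: both decisions depend on prefetch_all, the table options, and the file's level. The helper below restates the boolean logic of this hunk as a reader's aid; the struct and function are invented, though the option names and conditions mirror the code above.

// Reader's aid: restates the pin/prefetch decision logic from
// PrefetchIndexAndFilterBlocks. The struct and function are invented;
// only the conditions come from the patch.
struct PinPrefetchDecision {
  bool prefetch_index;
  bool prefetch_filter;
  bool pin_index;
  bool pin_filter;
};

inline PinPrefetchDecision DecidePinPrefetch(
    bool prefetch_all, bool pin_l0_filter_and_index_blocks_in_cache,
    bool pin_top_level_index_and_filter, bool two_level_index,
    bool partitioned_filter, int level) {
  PinPrefetchDecision d;
  // Pin everything (down to all partitions) only for level-0 files when
  // pin_l0_filter_and_index_blocks_in_cache is set.
  const bool pin_all =
      pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // The first level of a two-level index is prefetched/pinned when the
  // top-level option is set; prefetch_all forces prefetching.
  d.prefetch_index = prefetch_all ||
                     (pin_top_level_index_and_filter && two_level_index);
  d.pin_index = pin_all ||
                (pin_top_level_index_and_filter && two_level_index);
  // Same shape for partitioned filters; a partitioned filter requires a
  // partitioned (two-level) index.
  d.prefetch_filter = prefetch_all || (pin_top_level_index_and_filter &&
                                       partitioned_filter);
  d.pin_filter = pin_all ||
                 (pin_top_level_index_and_filter && partitioned_filter);
  return d;
}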
- if (table_options.cache_index_and_filter_blocks) { - assert(table_options.block_cache != nullptr); - if (s.ok() && prefetch_filter) { - // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = - new_table->GetFilter(rep_->table_prefix_extractor.get(), - /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - if (filter_entry.GetValue() != nullptr && prefetch_all) { - filter_entry.GetValue()->CacheDependencies( - pin_all, rep_->table_prefix_extractor.get()); - } - // if pin_filter is true then save it in rep_->filter_entry; it will be - // released in the destructor only, hence it will be pinned in the - // cache while this reader is alive - if (pin_filter) { - rep_->filter_entry = std::move(filter_entry); - } - } - } else { + if (!table_options.cache_index_and_filter_blocks) { std::unique_ptr compression_dict_block; - if (s.ok()) { - // Set filter block - if (rep_->filter_policy) { - const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter( - prefetch_buffer, rep_->filter_handle, !is_a_filter_partition, - rep_->table_prefix_extractor.get()); - rep_->filter.reset(filter); - // Refer to the comment above about paritioned indexes always being - // cached - if (filter && prefetch_all) { - filter->CacheDependencies(pin_all, - rep_->table_prefix_extractor.get()); - } - } - s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + if (!s.ok()) { + return s; } - if (s.ok() && !rep_->compression_dict_handle.IsNull()) { + + if (!rep_->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy rep_->uncompression_dict.reset(new UncompressionDict( @@ -1560,6 +1573,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); } } + + assert(s.ok()); return s; } @@ -1631,10 +1646,43 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, return Status::OK(); } +template +class BlocklikeTraits; + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = @@ -1654,7 +1702,7 @@ Status BlockBasedTable::GetDataBlockFromCache( block_type, get_context); if (cache_handle != nullptr) { block->SetCachedValue( - 
reinterpret_cast(block_cache->Value(cache_handle)), + reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); return s; } @@ -1698,16 +1746,17 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - std::unique_ptr block_holder( - new Block(std::move(contents), rep_->get_global_seqno(block_type), - read_amp_bytes_per_bit, statistics)); // uncompressed block + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + &DeleteCachedEntry, &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1730,10 +1779,11 @@ Status BlockBasedTable::GetDataBlockFromCache( return s; } +template Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, + CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, BlockType block_type, @@ -1757,7 +1807,7 @@ Status BlockBasedTable::PutDataBlockToCache( Status s; Statistics* statistics = ioptions.statistics; - std::unique_ptr block_holder; + std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { // Retrieve the uncompressed contents into a new buffer BlockContents uncompressed_block_contents; @@ -1771,11 +1821,13 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } - block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } else { - block_holder.reset(new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } // Insert compressed block into compressed block cache. 
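The BlocklikeTraits specializations introduced above exist so that a single set of cache helper templates can produce either a parsed block or raw block contents. The following is a self-contained illustration of the same traits idiom; the types here are invented stand-ins, not the RocksDB classes.

#include <cstdint>
#include <memory>
#include <string>
#include <utility>

// Invented stand-ins for the two "blocklike" shapes: raw bytes vs. a
// parsed block that knows its restart count.
struct RawBytes {
  std::string data;
};

struct ParsedBlock {
  explicit ParsedBlock(std::string d) : data(std::move(d)) {}
  std::string data;
  uint32_t num_restarts = 7;
};

// Primary template is intentionally left undefined; only the
// specializations below may be instantiated.
template <typename TBlocklike>
class Traits;

template <>
class Traits<RawBytes> {
 public:
  static RawBytes* Create(std::string&& d) {
    return new RawBytes{std::move(d)};
  }
  // Raw bytes are never parsed, so there are no restarts to report.
  static uint32_t GetNumRestarts(const RawBytes&) { return 0; }
};

template <>
class Traits<ParsedBlock> {
 public:
  static ParsedBlock* Create(std::string&& d) {
    return new ParsedBlock(std::move(d));
  }
  static uint32_t GetNumRestarts(const ParsedBlock& b) {
    return b.num_restarts;
  }
};

// One routine now serves both types, the same way the templated cache
// helpers above dispatch through BlocklikeTraits.
template <typename TBlocklike>
std::unique_ptr<TBlocklike> MakeBlocklike(std::string bytes) {
  return std::unique_ptr<TBlocklike>(
      Traits<TBlocklike>::Create(std::move(bytes)));
}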
@@ -1809,7 +1861,8 @@ Status BlockBasedTable::PutDataBlockToCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, priority); + &DeleteCachedEntry, &cache_handle, + priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1829,171 +1882,36 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor) const { +std::unique_ptr BlockBasedTable::CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { auto& rep = rep_; - // TODO: We might want to unify with ReadBlockFromFile() if we start - // requiring checksum verification in Table::Open. - if (rep->filter_type == Rep::FilterType::kNoFilter) { - return nullptr; - } - BlockContents block; - - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kFilter, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options)); - Status s = block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - // Error reading the block - return nullptr; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr(); } assert(rep->filter_policy); - auto filter_type = rep->filter_type; - if (rep->filter_type == Rep::FilterType::kPartitionedFilter && - is_a_filter_partition) { - filter_type = Rep::FilterType::kFullFilter; - } - switch (filter_type) { - case Rep::FilterType::kPartitionedFilter: { - return new PartitionedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this, - rep_->index_key_includes_seq, rep_->index_value_is_full); - } + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kBlockFilter: - return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block), - rep->ioptions.statistics); - - case Rep::FilterType::kFullFilter: { - auto filter_bits_reader = - rep->filter_policy->GetFilterBitsReader(block.data); - assert(filter_bits_reader != nullptr); - return new FullFilterBlockReader( - rep->prefix_filtering ? 
prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader, - rep->ioptions.statistics); - } + return BlockBasedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); default: // filter_type is either kNoFilter (exited the function at the first if), // or it must be covered in this switch block assert(false); - return nullptr; - } -} - -CachableEntry BlockBasedTable::GetFilter( - const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const { - const BlockHandle& filter_blk_handle = rep_->filter_handle; - const bool is_a_filter_partition = true; - return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, lookup_context, prefix_extractor); -} - -CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const { - // If cache_index_and_filter_blocks is false, filter should be pre-populated. - // We will return rep_->filter anyway. rep_->filter can be nullptr if filter - // read fails at Open() time. We don't want to reload again since it will - // most probably fail again. - if (!is_a_filter_partition && - !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr, - /*own_value=*/false}; - } - - Cache* block_cache = rep_->table_options.block_cache.get(); - if (rep_->filter_policy == nullptr /* do not use filter */ || - block_cache == nullptr /* no block cache at all */) { - return CachableEntry(); - } - - if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { - return {rep_->filter_entry.GetValue(), /*cache=*/nullptr, - /*cache_handle=*/nullptr, /*own_value=*/false}; - } - - PERF_TIMER_GUARD(read_filter_block_nanos); - - // Fetching from the cache - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - filter_blk_handle, cache_key); - - Cache::Handle* cache_handle = - GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); - - FilterBlockReader* filter = nullptr; - size_t usage = 0; - bool is_cache_hit = false; - bool return_empty_reader = false; - if (cache_handle != nullptr) { - filter = - reinterpret_cast(block_cache->Value(cache_handle)); - usage = filter->ApproximateMemoryUsage(); - is_cache_hit = true; - } else if (no_io) { - // Do not invoke any io. - return_empty_reader = true; - } else { - filter = ReadFilter(prefetch_buffer, filter_blk_handle, - is_a_filter_partition, prefix_extractor); - if (filter != nullptr) { - usage = filter->ApproximateMemoryUsage(); - Status s = block_cache->Insert( - key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - if (s.ok()) { - UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); - } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete filter; - return_empty_reader = true; - } - } - } - - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - lookup_context) { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", TraceType::kBlockTraceFilterBlock, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id); - block_cache_tracer_->WriteBlockAccess(access_record, key, - rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + return std::unique_ptr(); } - - if (return_empty_reader) { - return CachableEntry(); - } - return {filter, cache_handle ? block_cache : nullptr, cache_handle, - /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( @@ -2178,6 +2096,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } block.TransferTo(iter); + return iter; } @@ -2294,10 +2213,11 @@ Status BlockBasedTable::GetDataBlockFromCache( // If contents is non-null, it skips the cache lookup and disk read, since // the caller has already read it. In both cases, if ro.fill_cache is true, // it inserts the block into the block cache. +template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); @@ -2347,17 +2267,18 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { no_insert = false; Statistics* statistics = rep_->ioptions.statistics; - bool do_decompress = - block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, - &raw_block_contents, rep_->ioptions, - do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, - block_type, uncompression_dict, rep_->persistent_cache_options, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); @@ -2387,21 +2308,25 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( uint64_t nkeys = 0; if (block_entry->GetValue()) { // Approximate the number of keys in the block using restarts. 
- nkeys = rep_->table_options.block_restart_interval * - block_entry->GetValue()->NumRestarts(); + nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits::GetNumRestarts(*block_entry->GetValue()); usage = block_entry->GetValue()->ApproximateMemoryUsage(); } TraceType trace_block_type = TraceType::kTraceMax; switch (block_type) { - case BlockType::kIndex: - trace_block_type = TraceType::kBlockTraceIndexBlock; - break; case BlockType::kData: trace_block_type = TraceType::kBlockTraceDataBlock; break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; case BlockType::kRangeDeletion: trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; default: // This cannot happen. assert(false); @@ -2603,10 +2528,11 @@ void BlockBasedTable::MaybeLoadBlocksToCache( } } +template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction) const { assert(block_entry); @@ -2639,16 +2565,19 @@ Status BlockBasedTable::RetrieveBlock( return Status::Incomplete("no blocking io"); } - std::unique_ptr block; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr block; { StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, - rep_->ioptions, rep_->blocks_maybe_compressed, - rep_->blocks_maybe_compressed, block_type, uncompression_dict, - rep_->persistent_cache_options, rep_->get_global_seqno(block_type), + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0, @@ -2665,6 +2594,22 @@ Status BlockBasedTable::RetrieveBlock( return s; } +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. 
+template Status BlockBasedTable::RetrieveBlock<BlockContents>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<BlockContents>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + +template Status BlockBasedTable::RetrieveBlock<Block>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map) @@ -2733,10 +2678,7 @@ bool BlockBasedTable::PrefixMayMatch( Status s; // First, try check with full filter - auto filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - FilterBlockReader* filter = filter_entry.GetValue(); + FilterBlockReader* const filter = rep_->filter.get(); bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { @@ -2798,7 +2740,7 @@ bool BlockBasedTable::PrefixMayMatch( BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, - /*const_key_ptr=*/nullptr, lookup_context); + /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); } } } @@ -3273,7 +3215,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( bool BlockBasedTable::FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor, + const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, - no_io, const_ikey_ptr, lookup_context); + no_io, const_ikey_ptr, get_context, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, false, - const_ikey_ptr, lookup_context)) { + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { may_match = false; } if (may_match) { @@ -3312,7 +3255,7 @@ void BlockBasedTable::FullFilterKeysMayMatch( if (filter == nullptr || filter->IsBlockBased()) { return; } - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && @@ -3338,25 +3281,19 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice&
key, assert(get_context != nullptr); Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry<FilterBlockReader> filter_entry; - bool may_match; - FilterBlockReader* filter = nullptr; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If the full filter is not useful, then go into each block uint64_t tracing_get_id = get_context->get_tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; - { - if (!skip_filters) { - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - get_context, &lookup_context); - } - filter = filter_entry.GetValue(); + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor, &lookup_context); - } if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); @@ -3388,7 +3325,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), prefix_extractor, v.handle.offset(), no_io, - /*const_ikey_ptr=*/nullptr, &lookup_context); + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); if (not_exist_in_filter) { // Not found @@ -3510,31 +3448,23 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry<FilterBlockReader> filter_entry; - FilterBlockReader* filter = nullptr; + FilterBlockReader* const filter = + !skip_filters ?
rep_->filter.get() : nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), mget_range->end()); + + // First check the full filter + // If the full filter is not useful, then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, tracing_mget_id}; - if (!skip_filters) { - { - // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - /*get_context=*/nullptr, &lookup_context); - } - filter = filter_entry.GetValue(); + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor, &lookup_context); - } if (skip_filters || !sst_file_range.empty()) { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable @@ -4006,7 +3936,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); @@ -4022,8 +3952,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Status BlockBasedTable::CreateIndexReader( FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, - bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
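One mechanical change threaded through this patch is worth calling out for reviewers: factories such as CreateIndexReader now return their product through a std::unique_ptr out-parameter instead of a raw double pointer, which removes the manual delete on error paths. A generic before/after sketch follows, with invented types used purely for illustration.

#include <memory>

struct Status {
  bool ok() const { return true; }
};

class IndexReader {};

// Old shape: raw out-parameter. Every error path after the allocation
// must remember to delete and null out the pointer.
Status CreateRaw(IndexReader** index_reader) {
  *index_reader = new IndexReader;
  return Status();
}

// New shape: ownership is explicit. An early return on error simply
// destroys the unique_ptr's contents; nothing leaks.
Status CreateOwned(std::unique_ptr<IndexReader>* index_reader) {
  index_reader->reset(new IndexReader);
  return Status();
}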
@@ -4033,14 +3963,14 @@ Status BlockBasedTable::CreateIndexReader( switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kBinarySearch: case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -4056,14 +3986,14 @@ Status BlockBasedTable::CreateIndexReader( " Fall back to binary search index."); return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, - index_reader, lookup_context); + lookup_context, index_reader); } meta_index_iter = meta_iter_guard.get(); } return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, - use_cache, prefetch, pin, index_reader, - lookup_context); + use_cache, prefetch, pin, lookup_context, + index_reader); } default: { std::string error_message = @@ -4079,7 +4009,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/&context)); + /*lookup_context=*/&context)); index_iter->Seek(key); uint64_t result; @@ -4102,8 +4032,9 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, return result; } -bool BlockBasedTable::TEST_filter_block_preloaded() const { - return rep_->filter != nullptr; +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); } bool BlockBasedTable::TEST_IndexBlockInCache() const { @@ -4167,8 +4098,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( return Status::OK(); } -Status BlockBasedTable::DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor) { +Status BlockBasedTable::DumpTable(WritableFile* out_file) { // Output Footer out_file->Append( "Footer Details:\n" @@ -4225,36 +4155,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, false /*maybe_compressed*/, - BlockType::kFilter, UncompressionDict::GetEmptyDict(), - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); - } - } - } - } 
} + if (rep_->filter) { out_file->Append( "Filter Details:\n" @@ -4318,22 +4220,17 @@ void BlockBasedTable::Close() { return; } - Cache* const cache = rep_->table_options.block_cache.get(); - // cleanup index, filter, and compression dictionary blocks // to avoid accessing dangling pointers if (!rep_->table_options.no_block_cache) { - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - - // Get the filter block key - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->filter_handle, cache_key); - cache->Erase(key); - if (!rep_->compression_dict_handle.IsNull()) { // Get the compression dictionary block key - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key); + + Cache* const cache = rep_->table_options.block_cache.get(); cache->Erase(key); } } @@ -4518,15 +4415,6 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, namespace { -void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { - FilterBlockReader* filter = reinterpret_cast(value); - if (filter->statistics() != nullptr) { - RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->ApproximateMemoryUsage()); - } - delete filter; -} - void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { UncompressionDict* dict = reinterpret_cast(value); RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 750700813d3..189cd5d2e3a 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -172,8 +172,7 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor = nullptr) override; + Status DumpTable(WritableFile* out_file) override; Status VerifyChecksum(TableReaderCaller caller) override; @@ -181,7 +180,7 @@ class BlockBasedTable : public TableReader { ~BlockBasedTable(); - bool TEST_filter_block_preloaded() const; + bool TEST_FilterBlockInCache() const; bool TEST_IndexBlockInCache() const; // IndexReader is the interface that provides the functionality for index @@ -241,6 +240,8 @@ class BlockBasedTable : public TableReader { class PartitionedIndexIteratorState; + template + friend class FilterBlockReaderCommon; friend class PartitionIndexReader; protected: @@ -278,21 +279,23 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. + template Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). 
+ template Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction = false) const; @@ -310,19 +313,6 @@ class BlockBasedTable : public TableReader { CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, char* scratch, const UncompressionDict& uncompression_dict) const; - // For the following two functions: - // if `no_io == true`, we will not try to read filter/index from sst file - // were they not present in cache yet. - CachableEntry GetFilter( - const SliceTransform* prefix_extractor, - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; - virtual CachableEntry GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const; - CachableEntry GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -348,12 +338,13 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. + template Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context = nullptr) const; + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -365,11 +356,12 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
+ template Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, + CachableEntry* cached_block, + BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const; @@ -387,13 +379,14 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, - IndexReader** index_reader, - BlockCacheLookupContext* lookup_context); + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); bool FullFilterKeyMayMatch(const ReadOptions& read_options, FilterBlockReader* filter, const Slice& user_key, const bool no_io, const SliceTransform* prefix_extractor, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const; void FullFilterKeysMayMatch(const ReadOptions& read_options, @@ -435,10 +428,9 @@ class BlockBasedTable : public TableReader { Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. - virtual FilterBlockReader* ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor = nullptr) const; + std::unique_ptr CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); static void SetupCacheKeyPrefix(Rep* rep); @@ -516,17 +508,7 @@ struct BlockBasedTable::Rep { // Footer contains the fixed table information Footer footer; - // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr) - // and used only when options.block_cache is nullptr or when - // `cache_index_and_filter_blocks == false`. Otherwise, we will get the - // filter and compression dictionary blocks via the block cache. In that case, - // `filter_handle`, and `compression_dict_handle` are used to lookup these - // meta-blocks in block cache. - // - // Note: the IndexReader object is always stored in this member variable; - // the index block itself, however, may or may not be in the block cache - // based on the settings above. We plan to change the handling of the - // filter and compression dictionary similarly. 
+ std::unique_ptr index_reader; std::unique_ptr filter; std::unique_ptr uncompression_dict; @@ -553,13 +535,6 @@ struct BlockBasedTable::Rep { std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; - // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is - // true or in all levels when pin_top_level_index_and_filter is set in - // combination with partitioned filters: then we do use the LRU cache, - // but we always keep the filter block's handle checked out here (=we - // don't call Release()), plus the parsed out objects the LRU cache will never - // push flush them out, hence they're pinned - CachableEntry filter_entry; std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 5b5d16ef318..b4cd6ec6757 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -10,6 +10,7 @@ #pragma once #include +#include "port/likely.h" #include "rocksdb/cache.h" #include "rocksdb/cleanable.h" diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index d54de5ae1ab..936281bde65 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -38,6 +38,7 @@ namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; +class GetContext; using MultiGetRange = MultiGetContext::Range; // A FilterBlockBuilder is used to construct all of the filters for a @@ -78,16 +79,14 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - explicit FilterBlockReader() - : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} - explicit FilterBlockReader(size_t s, Statistics* stats, - bool _whole_key_filtering) - : whole_key_filtering_(_whole_key_filtering), - size_(s), - statistics_(stats) {} - virtual ~FilterBlockReader() {} + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; virtual bool IsBlockBased() = 0; // If is blockbased filter + /** * If no_io is set, then it returns true if it cannot answer the query without * reading data from disk. 
This is used in PartitionedFilterBlockReader to @@ -102,17 +101,19 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - context)) { + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -125,27 +126,26 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, context)) { + block_offset, no_io, &ikey, get_context, + lookup_context)) { range->SkipKey(iter); } } } virtual size_t ApproximateMemoryUsage() const = 0; - virtual size_t size() const { return size_; } - virtual Statistics* statistics() const { return statistics_; } - - bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string ToString() const { @@ -153,30 +153,22 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool /*pin*/, - const SliceTransform* /*prefix_extractor*/) {} + virtual void CacheDependencies(bool /*pin*/) {} - virtual bool RangeMayExist( - const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool /*need_upper_bound_check*/, BlockCacheLookupContext* context) { + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, + bool /*need_upper_bound_check*/, + BlockCacheLookupContext* lookup_context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } - - protected: - bool whole_key_filtering_; - - private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const FilterBlockReader&); - size_t size_; - Statistics* statistics_; - int level_ = -1; }; } // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc new file mode 100644 index 00000000000..717a4ad0dff --- 
/dev/null +++ b/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" + +namespace rocksdb { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + get_context, lookup_context, filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h new file mode 100644 index 00000000000..3698d3f1e91 --- /dev/null +++ b/table/block_based/filter_block_reader_common.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace rocksdb { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. 
Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; +}; + +} // namespace rocksdb diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 6d2b9d70a50..553bd37d974 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -16,6 +16,7 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -98,59 +99,91 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FilterBlockReader(contents.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); - if (prefix_extractor_ != nullptr) { + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { full_length_enabled_ = - prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, - contents.data, filter_bits_reader, stats) { - block_contents_ = std::move(contents); -} - bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key); + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr FullFilterBlockReader::Create( + const 
BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new FullFilterBlockReader(table, std::move(filter_block))); } bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - return MayMatch(prefix); + return MayMatch(prefix, no_io, get_context, lookup_context); } -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - if (filter_bits_reader_->MayMatch(entry)) { +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() != 0) { + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + + if (filter_bits_reader->MayMatch(entry)) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; } else { @@ -163,38 +196,58 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { // Simply return. 
Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range); + MayMatch(range, no_io, lookup_context); } void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - MayMatch(range); + MayMatch(range, no_io, lookup_context); } -void FullFilterBlockReader::MayMatch(MultiGetRange* range) { - if (contents_.size() == 0) { +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { return; } + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() == 0) { + return; + } + + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + // We need to use an array instead of autovector for may_match since // &may_match[0] doesn't work for autovector (compiler error). So // declare both keys and may_match as arrays, which is also slightly less @@ -205,7 +258,7 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { for (auto iter = range->begin(); iter != range->end(); ++iter) { keys[num_keys++] = &iter->ukey; } - filter_bits_reader_->MayMatch(num_keys, &keys[0], &may_match[0]); + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); int i = 0; for (auto iter = range->begin(); iter != range->end(); ++iter) { @@ -217,13 +270,11 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = block_contents_.usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); - usage += malloc_usable_size(filter_bits_reader_.get()); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); - usage += sizeof(*filter_bits_reader_.get()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return usage; } @@ -232,7 +283,7 @@ bool FullFilterBlockReader::RangeMayExist( const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, BlockCacheLookupContext* context) { + bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -245,22 +296,23 @@ bool FullFilterBlockReader::RangeMayExist( } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } } bool FullFilterBlockReader::IsFilterCompatible( const Slice* iterate_upper_bound, const Slice& prefix, - const Comparator* comparator) { + const Comparator* comparator) const { // Try to reuse the bloom filter in the SST table if prefix_extractor in // mutable_cf_options has changed. 
If range [user_key, upper_bound) all // share the same prefix then we may still be able to use the bloom filter. - if (iterate_upper_bound != nullptr && prefix_extractor_) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { return false; } - Slice upper_bound_xform = - prefix_extractor_->Transform(*iterate_upper_bound); + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix if (!comparator->Equal(prefix, upper_bound_xform)) { // second check if user_key's prefix is the immediate predecessor of diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 99e5299b34f..08a41706e6b 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -15,7 +15,8 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/block_based/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -78,71 +79,58 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { +class FullFilterBlockReader : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - const Slice& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - - // bits_reader is created in filter_policy, it should be passed in here - // directly. 
and be deleted here - ~FullFilterBlockReader() override {} + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; private: - const SliceTransform* prefix_extractor_; - Slice contents_; - std::unique_ptr filter_bits_reader_; - BlockContents block_contents_; bool full_length_enabled_; size_t prefix_extractor_full_length_; - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - bool MayMatch(const Slice& entry); - void MayMatch(MultiGetRange* range); - void operator=(const FullFilterBlockReader&); - bool IsFilterCompatible(const Slice* iterate_upper_bound, - const Slice& prefix, const Comparator* comparator); - }; } // namespace rocksdb diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 57ff158c5c7..e8fcce07d75 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -6,6 +6,7 @@ #include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/full_filter_bits_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -40,6 +41,15 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { std::vector hash_entries_; }; +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { + // Initialize what Open normally does as much as necessary 
for the test + rep->cache_key_prefix_size = 10; + } +}; + class TestFilterBitsReader : public FilterBitsReader { public: explicit TestFilterBitsReader(const Slice& contents) @@ -95,26 +105,46 @@ class TestHashFilter : public FilterPolicy { class PluginFullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); + PluginFullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; + table_options_.filter_policy.reset(new TestHashFilter); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); } }; TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -125,57 +155,90 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { builder.Add("box"); builder.Add("box"); builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + 
ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } class FullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - FullFilterBlockTest() { + FullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - ~FullFilterBlockTest() override {} + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); + } }; TEST_F(FullFilterBlockTest, EmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(FullFilterBlockTest, DuplicateEntries) { @@ -221,31 +284,46 @@ TEST_F(FullFilterBlockTest, SingleChunk) { builder.Add("box"); 
builder.Add("hello"); ASSERT_EQ(5, builder.NumAdded()); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dcd985152bb..ae57e85dca6 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -119,113 +119,77 @@ Slice PartitionedFilterBlockBuilder::Finish( } PartitionedFilterBlockReader::PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - 
const bool index_value_is_full) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - comparator_(comparator), - table_(table), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - idx_on_fltr_blk_.reset(new Block(std::move(contents), - kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, stats)); -} + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} -PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - // TODO(myabandeh): if instead of filter object we store only the blocks in - // block cache, then we don't have to manually earse them from block cache - // here. - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (UNLIKELY(block_cache == nullptr)) { - return; - } - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value().handle; - auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, - table_->rep_->cache_key_prefix_size, - handle, cache_key); - block_cache->Erase(key); +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); } bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!whole_key_filtering_) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { + if (!whole_key_filtering()) { return true; } - return filter_partition.GetValue()->KeyMayMatch( - key, prefix_extractor, block_offset, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - 
BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!prefix_extractor_ && !prefix_extractor) { + if (!table_prefix_extractor() && !prefix_extractor) { return true; } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { - return true; - } - return filter_partition.GetValue()->PrefixMayMatch( - prefix, prefix_extractor, kNotValid, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( - const Slice& entry) { + const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); @@ -235,39 +199,78 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } -CachableEntry -PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context) { - const bool is_a_filter_partition = true; - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (LIKELY(block_cache != nullptr)) { - if (filter_map_.size() != 0) { - auto iter = filter_map_.find(fltr_blk_handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (iter != filter_map_.end()) { - return {iter->second.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; - } +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); } - return table_->GetFilter(/*prefetch_buffer=*/nullptr, fltr_blk_handle, - is_a_filter_partition, no_io, - /*get_context=*/nullptr, context, - prefix_extractor); - } else { - auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - 
is_a_filter_partition, prefix_extractor); - return {filter, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); } size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = idx_on_fltr_blk_->usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -276,16 +279,36 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies( - bool pin, const SliceTransform* prefix_extractor) { - // Before read partitions, prefetch them to avoid lots of IOs +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); // Index partitions are 
assumed to be consecutive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -298,27 +321,55 @@ void PartitionedFilterBlockReader::CacheDependencies( uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s; - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, static_cast(prefetch_len)); // After prefetch, read the partitions one by one - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; - const bool no_io = true; - const bool is_a_filter_partition = true; - auto filter = table_->GetFilter( - prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /*get_context=*/nullptr, &lookup_context, prefix_extractor); - if (LIKELY(filter.IsCached())) { - if (pin) { - filter_map_[handle.offset()] = std::move(filter); + + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } } } } } +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 4b0fb523d0d..b73ae3baa75 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -14,8 +14,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" -#include "table/block_based/block_based_table_reader.h" -#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" #include "util/autovector.h" @@ -69,44 +68,57 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { BlockHandle last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader { +class PartitionedFilterBlockReader : public FilterBlockReaderCommon { public: - explicit PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full); - ~PartitionedFilterBlockReader() override; + PartitionedFilterBlockReader(const 
BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; private: - BlockHandle GetFilterPartitionHandle(const Slice& entry); - CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context); - void CacheDependencies(bool bin, - const SliceTransform* prefix_extractor) override; - - const SliceTransform* prefix_extractor_; - std::unique_ptr idx_on_fltr_blk_; - const InternalKeyComparator comparator_; - const BlockBasedTable* table_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; - std::unordered_map> filter_map_; + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map> filter_map_; }; } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 34ecfa4ac65..5e9e467723c 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -7,6 +7,7 @@ #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/partitioned_filter_block.h" #include "table/full_filter_bits_builder.h" @@ -23,34 +24,29 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test rep->cache_key_prefix_size = 10; + rep->index_key_includes_seq = 
pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } +}; - CachableEntry GetFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */, GetContext* /* unused */, - BlockCacheLookupContext* /*context*/, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; - } - - FilterBlockReader* ReadFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return obj; +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : slices) { + const uint64_t offset = pair.first; + const Slice& slice = pair.second; + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, + nullptr /* cache_handle */, true /* own_value */); + filter_map_[offset] = std::move(block); + } } }; @@ -58,10 +54,18 @@ class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; - InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); + InternalKeyComparator icomp_; + std::unique_ptr table_; + std::shared_ptr cache_; - PartitionedFilterBlockTest() { + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not @@ -70,7 +74,6 @@ class PartitionedFilterBlockTest table_options_.index_block_restart_interval = 3; } - std::shared_ptr cache_; ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; @@ -110,7 +113,7 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -131,11 +134,8 @@ class PartitionedFilterBlockTest p_index_builder, partition_size); } - std::unique_ptr table; - PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, - const SliceTransform* prefix_extractor) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { BlockHandle bh; Status status; Slice slice; @@ -143,19 +143,21 @@ class PartitionedFilterBlockTest slice = builder->Finish(bh, &status); bh = Write(slice); } while (status.IsIncomplete()); - const Options options; - const ImmutableCFOptions ioptions(options); - const MutableCFOptions 
moptions(options); - const EnvOptions env_options; - const bool kSkipFilters = true; - const bool kImmortal = true; - table.reset(new MockedBlockBasedTable( - new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, 0, !kImmortal))); - auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, - table.get(), pib->seperator_is_key_plus_seq(), - !pib->get_use_value_delta_encoding()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); return reader; } @@ -163,36 +165,37 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false, const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib, prefix_extractor)); + NewReader(builder, pib)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice, /*context=*/nullptr)); + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } } } @@ -336,13 +339,14 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { builder->Add(pkeys[2]); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( - NewReader(builder.get(), pib.get(), prefix_extractor.get())); + NewReader(builder.get(), pib.get())); for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch( prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, - /*no_io=*/false, &ikey_slice, /*context=*/nullptr)); + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } diff --git a/table/table_reader.h 
b/table/table_reader.h index 1c879cb1f81..72d11a7bd24 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -116,8 +116,7 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/, - const SliceTransform* /*prefix_extractor*/) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } diff --git a/table/table_test.cc b/table/table_test.cc index c3a1f82ed37..c54933b781a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2296,7 +2296,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { // preloading filter/index blocks is enabled. auto reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_FALSE(reader->TEST_IndexBlockInCache()); { @@ -2343,7 +2343,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_TRUE(reader->TEST_IndexBlockInCache()); // -- PART 1: Open with regular block cache. @@ -2476,7 +2476,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 260d15f303c..44a733b57c6 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -150,8 +150,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { std::unique_ptr out_file; Env* env = Env::Default(); env->NewWritableFile(out_filename, &out_file, soptions_); - Status s = table_reader_->DumpTable(out_file.get(), - moptions_.prefix_extractor.get()); + Status s = table_reader_->DumpTable(out_file.get()); out_file->Close(); return s; } From 0acaa1a8464f35d0f4cf83a1bafbad662bfe0c99 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 16 Jul 2019 15:19:45 -0700 Subject: [PATCH 225/572] WriteUnPrepared: use tracked_keys_ to track keys needed for rollback (#5562) Summary: Currently, we are tracking keys we need to roll back via a separate structure specific to WriteUnprepared in write_set_keys_. We already have a data structure called tracked_keys_ used to track which keys to unlock on transaction termination. This is exactly what we want, since we should only roll back keys that we have locked anyway. Save some memory by reusing that data structure instead of making our own. 
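To make the reuse concrete, below is a minimal sketch under simplified assumptions. TrackedKeys here is a hypothetical stand-in for the real TransactionKeyMap (which additionally records per-key metadata such as sequence numbers and lock exclusivity); the point is that a single per-column-family key map can drive both the rollback path and the unlock-on-termination path, so no parallel write_set_keys_-style copy of the key set is needed:

  #include <cstdint>
  #include <iostream>
  #include <string>
  #include <unordered_map>

  // Hypothetical stand-in for tracked_keys_: column family ID -> keys this
  // transaction has locked/written (the bool is a dummy "exclusive" flag).
  using TrackedKeys =
      std::unordered_map<uint32_t, std::unordered_map<std::string, bool>>;

  // On abort: restore old values for exactly the keys the txn locked.
  void RollbackKeys(const TrackedKeys& tracked) {
    for (const auto& cf : tracked) {
      for (const auto& key : cf.second) {
        std::cout << "rollback cf=" << cf.first << " key=" << key.first << "\n";
      }
    }
  }

  // On termination: release exactly the same set of locks.
  void UnlockKeys(const TrackedKeys& tracked) {
    for (const auto& cf : tracked) {
      for (const auto& key : cf.second) {
        std::cout << "unlock cf=" << cf.first << " key=" << key.first << "\n";
      }
    }
  }

  int main() {
    TrackedKeys tracked;
    tracked[0]["foo"] = true;  // tracked when the txn wrote "foo" in CF 0
    tracked[1]["bar"] = true;

    RollbackKeys(tracked);
    UnlockKeys(tracked);
    return 0;
  }

Since rollback should touch exactly the keys the transaction locked, keeping one structure also removes any chance of the two key sets drifting apart.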
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5562 Differential Revision: D16206484 Pulled By: lth fbshipit-source-id: 5894d2b824a4b19062d84adbd6e6e86f00047488 --- utilities/transactions/transaction_base.h | 12 +- .../transactions/write_unprepared_txn.cc | 119 ++++++++++++++---- utilities/transactions/write_unprepared_txn.h | 22 ++-- .../transactions/write_unprepared_txn_db.cc | 36 +----- .../transactions/write_unprepared_txn_db.h | 27 ---- 5 files changed, 111 insertions(+), 105 deletions(-) diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 04274866aab..26efd51b378 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -317,6 +317,12 @@ class TransactionBaseImpl : public Transaction { // Records writes pending in this transaction WriteBatchWithIndex write_batch_; + // Map from column_family_id to map of keys that are involved in this + // transaction. + // For Pessimistic Transactions this is the list of locked keys. + // Optimistic Transactions will wait till commit time to do conflict checking. + TransactionKeyMap tracked_keys_; + private: friend class WritePreparedTxn; // Extra data to be persisted with the commit. Note this is only used when @@ -327,12 +333,6 @@ class TransactionBaseImpl : public Transaction { // nullptr if there was no snapshot at the time SetSavePoint() was called. std::unique_ptr>> save_points_; - // Map from column_family_id to map of keys that are involved in this - // transaction. - // For Pessimistic Transactions this is the list of locked keys. - // Optimistic Transactions will wait till commit time to do conflict checking. - TransactionKeyMap tracked_keys_; - // If true, future Put/Merge/Deletes will be indexed in the // WriteBatchWithIndex. // If false, future Put/Merge/Deletes will be inserted directly into the diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 54d478c9466..d127220e47d 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -42,7 +42,9 @@ SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const WriteOptions& write_options, const TransactionOptions& txn_options) - : WritePreparedTxn(txn_db, write_options, txn_options), wupt_db_(txn_db) { + : WritePreparedTxn(txn_db, write_options, txn_options), + wupt_db_(txn_db), + recovered_txn_(false) { max_write_batch_size_ = txn_options.max_write_batch_size; // We set max bytes to zero so that we don't get a memory limit error. // Instead of trying to keep write batch strictly under the size limit, we @@ -69,6 +71,12 @@ WriteUnpreparedTxn::~WriteUnpreparedTxn() { log_number_); } } + + // Call tracked_keys_.clear() so that ~PessimisticTransaction does not + // try to unlock keys for recovered transactions. 
+ if (recovered_txn_) { + tracked_keys_.clear(); + } } void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { @@ -76,7 +84,7 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { max_write_batch_size_ = txn_options.max_write_batch_size; write_batch_.SetMaxBytes(0); unprep_seqs_.clear(); - write_set_keys_.clear(); + recovered_txn_ = false; } Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, @@ -148,6 +156,72 @@ Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked); } +// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For +// WriteUnprepared, the write batches have already been written into the +// database during WAL replay, so all we have to do is just to "retrack" the key +// so that rollbacks are possible. +// +// Calling TryLock instead of TrackKey is also possible, but as an optimization, +// recovered transactions do not hold locks on their keys. This follows the +// implementation in PessimisticTransactionDB::Initialize where we set +// skip_concurrency_control to true. +Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { + struct TrackKeyHandler : public WriteBatch::Handler { + WriteUnpreparedTxn* txn_; + bool rollback_merge_operands_; + + TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands) + : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + if (rollback_merge_operands_) { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + } + return Status::OK(); + } + + // Recovered batches do not contain 2PC markers. + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkNoop(bool) override { return Status::InvalidArgument(); } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + TrackKeyHandler handler(this, + wupt_db_->txn_db_options_.rollback_merge_operands); + return wb->Iterate(&handler); +} + Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { const bool kPrepared = true; Status s; @@ -159,25 +233,11 @@ Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { return s; } -void WriteUnpreparedTxn::UpdateWriteKeySet(uint32_t cfid, const Slice& key) { - // TODO(lth): write_set_keys_ can just be a std::string instead of a vector. 
- write_set_keys_[cfid].push_back(key.ToString()); -} - Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { if (name_.empty()) { return Status::InvalidArgument("Cannot write to DB without SetName."); } - // Update write_key_set_ for rollback purposes. - KeySetBuilder keyset_handler( - this, wupt_db_->txn_db_options_.rollback_merge_operands); - auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&keyset_handler); - assert(s.ok()); - if (!s.ok()) { - return s; - } - // TODO(lth): Reduce duplicate code with WritePrepared prepare logic. WriteOptions write_options = write_options_; write_options.disableWAL = false; @@ -204,10 +264,10 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { // WriteImpl should not overwrite that value, so set log_used to nullptr if // log_number_ is already set. uint64_t* log_used = log_number_ ? nullptr : &log_number_; - s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), - /*callback*/ nullptr, log_used, /*log ref*/ - 0, !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_, - &add_prepared_callback); + auto s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, log_used, /*log ref*/ + 0, !DISABLE_MEMTABLE, &seq_used, + prepare_batch_cnt_, &add_prepared_callback); assert(!s.ok() || seq_used != kMaxSequenceNumber); auto prepare_seq = seq_used; @@ -317,7 +377,6 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } // else do the 2nd write to publish seq @@ -349,7 +408,6 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } @@ -359,19 +417,21 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0); assert(GetId() != kMaxSequenceNumber); assert(GetId() > 0); + Status s; const auto& cf_map = *wupt_db_->GetCFHandleMap(); auto read_at_seq = kMaxSequenceNumber; - Status s; ReadOptions roptions; // Note that we do not use WriteUnpreparedTxnReadCallback because we do not // need to read our own writes when reading prior versions of the key for // rollback. 
+ const auto& tracked_keys = GetTrackedKeys(); WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq); - for (const auto& cfkey : write_set_keys_) { + for (const auto& cfkey : tracked_keys) { const auto cfid = cfkey.first; const auto& keys = cfkey.second; - for (const auto& key : keys) { + for (const auto& pair : keys) { + const auto& key = pair.first; const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; @@ -426,7 +486,6 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } // else do the 2nd write for commit uint64_t& prepare_seq = seq_used; @@ -453,10 +512,16 @@ Status WriteUnpreparedTxn::RollbackInternal() { } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } +void WriteUnpreparedTxn::Clear() { + if (!recovered_txn_) { + txn_db_impl_->UnLock(this, &GetTrackedKeys()); + } + TransactionBaseImpl::Clear(); +} + Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 751d36c23b9..15a76d13437 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -94,20 +94,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { const SliceParts& key, const bool assume_tracked = false) override; - virtual Status RebuildFromWriteBatch(WriteBatch*) override { - // This function was only useful for recovering prepared transactions, but - // is unused for write prepared because a transaction may consist of - // multiple write batches. - // - // If there are use cases outside of recovery that can make use of this, - // then support could be added. - return Status::NotSupported("Not supported for WriteUnprepared"); - } + virtual Status RebuildFromWriteBatch(WriteBatch*) override; const std::map& GetUnpreparedSequenceNumbers(); - void UpdateWriteKeySet(uint32_t cfid, const Slice& key); - protected: void Initialize(const TransactionOptions& txn_options) override; @@ -118,6 +108,8 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status RollbackInternal() override; + void Clear() override; + // Get and GetIterator needs to be overridden so that a ReadCallback to // handle read-your-own-write is used. using Transaction::Get; @@ -157,10 +149,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // commit callbacks. std::map unprep_seqs_; - // Set of keys that have written to that have already been written to DB - // (ie. not in write_batch_). - // - std::map> write_set_keys_; + // Recovered transactions have tracked_keys_ populated, but are not actually + // locked for efficiency reasons. For recovered transactions, skip unlocking + // keys when transaction ends. 
+ bool recovered_txn_; }; } // namespace rocksdb diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 9382edfad2b..c4be058bb96 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -252,12 +252,13 @@ Status WriteUnpreparedTxnDB::Initialize( assert(real_trx); auto wupt = static_cast_with_check(real_trx); + wupt->recovered_txn_ = true; real_trx->SetLogNumber(first_log_number); real_trx->SetId(first_seq); Status s = real_trx->SetName(recovered_trx->name_); if (!s.ok()) { - break; + return s; } wupt->prepare_batch_cnt_ = last_prepare_batch_cnt; @@ -270,12 +271,11 @@ Status WriteUnpreparedTxnDB::Initialize( ordered_seq_cnt[seq] = cnt; assert(wupt->unprep_seqs_.count(seq) == 0); wupt->unprep_seqs_[seq] = cnt; - KeySetBuilder keyset_handler(wupt, - txn_db_options_.rollback_merge_operands); - s = batch_info.batch_->Iterate(&keyset_handler); + + s = wupt->RebuildFromWriteBatch(batch_info.batch_); assert(s.ok()); if (!s.ok()) { - break; + return s; } } @@ -284,7 +284,7 @@ Status WriteUnpreparedTxnDB::Initialize( real_trx->SetState(Transaction::PREPARED); if (!s.ok()) { - break; + return s; } } // AddPrepared must be called in order @@ -397,29 +397,5 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, return db_iter; } -Status KeySetBuilder::PutCF(uint32_t cf, const Slice& key, - const Slice& /*val*/) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::DeleteCF(uint32_t cf, const Slice& key) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::SingleDeleteCF(uint32_t cf, const Slice& key) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::MergeCF(uint32_t cf, const Slice& key, - const Slice& /*val*/) { - if (rollback_merge_operands_) { - txn_->UpdateWriteKeySet(cf, key); - } - return Status::OK(); -} - } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index 6405ba68381..65cb4b9195a 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -144,32 +144,5 @@ class WriteUnpreparedRollbackPreReleaseCallback : public PreReleaseCallback { SequenceNumber rollback_seq_; }; -struct KeySetBuilder : public WriteBatch::Handler { - WriteUnpreparedTxn* txn_; - bool rollback_merge_operands_; - - KeySetBuilder(WriteUnpreparedTxn* txn, bool rollback_merge_operands) - : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} - - Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override; - - Status DeleteCF(uint32_t cf, const Slice& key) override; - - Status SingleDeleteCF(uint32_t cf, const Slice& key) override; - - Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override; - - // Recovered batches do not contain 2PC markers. 
- Status MarkNoop(bool) override { return Status::InvalidArgument(); } - Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } - Status MarkEndPrepare(const Slice&) override { - return Status::InvalidArgument(); - } - Status MarkCommit(const Slice&) override { return Status::InvalidArgument(); } - Status MarkRollback(const Slice&) override { - return Status::InvalidArgument(); - } -}; - } // namespace rocksdb #endif // ROCKSDB_LITE From 699a569c523c1d1083c2da79c5b42a3f70d74181 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 16 Jul 2019 16:27:32 -0700 Subject: [PATCH 226/572] Remove RandomAccessFileReader.for_compaction_ (#5572) Summary: RandomAccessFileReader.for_compaction_ doesn't seem to be used anymore. Remove it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5572 Test Plan: USE_CLANG=1 make all check -j Differential Revision: D16286178 fbshipit-source-id: aa338049761033dfbe5e8b1707bbb0be2df5be7e --- db/table_cache.cc | 5 ++--- db/table_cache.h | 3 +-- db/version_set.cc | 3 +-- util/file_reader_writer.cc | 1 - util/file_reader_writer.h | 5 +---- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index b98d4b074ff..121d4941fc0 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -93,7 +93,7 @@ Status TableCache::GetTableReader( bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache, bool for_compaction) { + bool prefetch_index_and_filter_in_cache) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -109,8 +109,7 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.env, record_read_stats ? 
ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, for_compaction, - ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, diff --git a/db/table_cache.h b/db/table_cache.h index f274337e952..f9fd4815228 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -179,8 +179,7 @@ class TableCache { std::unique_ptr* table_reader, const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - bool for_compaction = false); + bool prefetch_index_and_filter_in_cache = true); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; diff --git a/db/version_set.cc b/db/version_set.cc index 32dd61db830..0d3b9fb4e32 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1231,8 +1231,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, new RandomAccessFileReader( std::move(file), file_name, nullptr /* env */, nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, false /* for_compaction*/, - ioptions->listeners)); + nullptr /* rate_limiter */, ioptions->listeners)); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index db16e82ae11..15f41bf3a06 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -197,7 +197,6 @@ Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, Status s; uint64_t elapsed = 0; assert(!use_direct_io()); - assert(!for_compaction_); { StopWatch sw(env_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 0c5089d0758..3052ca8f4e0 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -108,7 +108,6 @@ class RandomAccessFileReader { uint32_t hist_type_; HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; - bool for_compaction_; std::vector> listeners_; public: @@ -116,7 +115,7 @@ class RandomAccessFileReader { std::unique_ptr&& raf, std::string _file_name, Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, - RateLimiter* rate_limiter = nullptr, bool for_compaction = false, + RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}) : file_(std::move(raf)), file_name_(std::move(_file_name)), @@ -125,7 +124,6 @@ class RandomAccessFileReader { hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - for_compaction_(for_compaction), listeners_() { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), @@ -151,7 +149,6 @@ class RandomAccessFileReader { hist_type_ = std::move(o.hist_type_); file_read_hist_ = std::move(o.file_read_hist_); rate_limiter_ = std::move(o.rate_limiter_); - for_compaction_ = std::move(o.for_compaction_); return *this; } From 0f4d90e6e4b3295cb5b6df6bbc36d2e2101b95f0 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Tue, 16 Jul 2019 18:18:07 -0700 Subject: [PATCH 227/572] Added support for sequential read-ahead file (#5580) Summary: Added support for sequential read-ahead file that can prefetch the read data and later serve it from internal cache buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5580 Differential Revision: D16287082 Pulled By: elipoz fbshipit-source-id: a3e7ad9643d377d39352ff63058ce050ec31dcf3 --- test_util/testutil.h | 221 ++++++++++++++++---------------- util/file_reader_writer.cc | 214 +++++++++++++++++++++++++++---- util/file_reader_writer.h | 18 ++- util/file_reader_writer_test.cc | 119 ++++++++++++++++- 4 files changed, 429 insertions(+), 143 deletions(-) diff --git a/test_util/testutil.h b/test_util/testutil.h index bb732ff3a5a..716ae7d26e8 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -492,13 +492,11 @@ inline std::string EncodeInt(uint64_t x) { return result; } -class StringEnv : public EnvWrapper { - public: class SeqStringSource : public SequentialFile { public: explicit SeqStringSource(const std::string& data) : data_(data), offset_(0) {} - ~SeqStringSource() {} + ~SeqStringSource() override {} Status Read(size_t n, Slice* result, char* scratch) override { std::string output; if (offset_ < data_.size()) { @@ -527,129 +525,136 @@ class StringEnv : public EnvWrapper { size_t offset_; }; - class StringSink : public WritableFile { + class StringEnv : public EnvWrapper { public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } + class StringSink : public WritableFile { + public: + explicit StringSink(std::string* contents) + : WritableFile(), contents_(contents) {} + virtual Status 
Truncate(uint64_t size) override { + contents_->resize(static_cast(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { return Status::OK(); } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } - private: - std::string* contents_; - }; + private: + std::string* contents_; + }; - explicit StringEnv(Env* t) : EnvWrapper(t) {} - virtual ~StringEnv() {} + explicit StringEnv(Env* t) : EnvWrapper(t) {} + ~StringEnv() override {} - const std::string& GetContent(const std::string& f) { return files_[f]; } + const std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, - const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - return s; + const Status WriteToNewFile(const std::string& file_name, + const std::string& content) { + std::unique_ptr r; + auto s = NewWritableFile(file_name, &r, EnvOptions()); + if (!s.ok()) { + return s; + } + r->Append(content); + r->Flush(); + r->Close(); + assert(files_[file_name] == content); + return Status::OK(); } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& /*options*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second)); + return Status::OK(); + } + Status NewRandomAccessFile(const std::string& /*f*/, + std::unique_ptr* /*r*/, + const EnvOptions& /*options*/) override { + return Status::NotSupported(); + } + Status NewWritableFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); + auto iter = files_.find(f); + if (iter != files_.end()) { + return Status::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return Status::OK(); } - r->reset(new SeqStringSource(iter->second)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); + virtual Status NewDirectory( + const std::string& /*name*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory(const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); + Status FileExists(const std::string& f) 
override { + if (files_.find(f) == files_.end()) { + return Status::NotFound(); + } + return Status::OK(); } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); + Status GetChildren(const std::string& /*dir*/, + std::vector* /*r*/) override { + return Status::NotSupported(); + } + Status DeleteFile(const std::string& f) override { + files_.erase(f); + return Status::OK(); + } + Status CreateDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status CreateDirIfMissing(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status DeleteDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return Status::OK(); } - *s = iter->second.size(); - return Status::OK(); - } - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } + Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*file_mtime*/) override { + return Status::NotSupported(); + } - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status RenameFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LinkFile(const std::string& /*s*/, const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status LinkFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } - Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + Status UnlockFile(FileLock* /*l*/) override { + return Status::NotSupported(); + } - protected: - std::unordered_map files_; -}; + protected: + std::unordered_map files_; + }; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 15f41bf3a06..9175fa502f9 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -639,6 +639,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { + // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { return file_->Read(offset, n, result, scratch); } @@ -646,14 +647,13 @@ class ReadaheadRandomAccessFile : 
public RandomAccessFile {
   std::unique_lock lk(lock_);
 
     size_t cached_len = 0;
-    // Check if there is a cache hit, means that [offset, offset + n) is either
-    // completely or partially in the buffer
+    // Check if there is a cache hit, meaning that [offset, offset + n) is either
+    // completely or partially in the buffer.
     // If it's completely cached, including end of file case when offset + n is
-    // greater than EOF, return
+    // greater than EOF, then return.
     if (TryReadFromCache(offset, n, &cached_len, scratch) &&
-        (cached_len == n ||
-         // End of file
-         buffer_.CurrentSize() < readahead_size_)) {
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
       *result = Slice(scratch, cached_len);
       return Status::OK();
     }
@@ -661,25 +661,14 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     // In the case of cache hit advanced_offset is already aligned, means that
     // chunk_offset equals to advanced_offset
     size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
-    Slice readahead_result;
     Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
     if (s.ok()) {
-      // In the case of cache miss, i.e. when cached_len equals 0, an offset can
-      // exceed the file end position, so the following check is required
-      if (advanced_offset < chunk_offset + buffer_.CurrentSize()) {
-        // In the case of cache miss, the first chunk_padding bytes in buffer_
-        // are
-        // stored for alignment only and must be skipped
-        size_t chunk_padding = advanced_offset - chunk_offset;
-        auto remaining_len =
-            std::min(buffer_.CurrentSize() - chunk_padding, n - cached_len);
-        memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding,
-               remaining_len);
-        *result = Slice(scratch, cached_len + remaining_len);
-      } else {
-        *result = Slice(scratch, cached_len);
-      }
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+                       scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
     }
     return s;
   }
@@ -690,6 +679,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
       // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
       return Status::OK();
     }
+
+    std::unique_lock lk(lock_);
+
     size_t offset_ = static_cast(offset);
     size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
     if (prefetch_offset == buffer_offset_) {
@@ -706,12 +698,18 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
   void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
 
   Status InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock lk(lock_);
+    buffer_.Clear();
     return file_->InvalidateCache(offset, length);
   }
 
   bool use_direct_io() const override { return file_->use_direct_io(); }
 
  private:
+  // Tries to read from buffer_ n bytes starting at offset. If anything was read
+  // from the cache, it sets cached_len to the number of bytes actually read,
+  // copies that number of bytes to scratch and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
   bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
                         char* scratch) const {
     if (offset < buffer_offset_ ||
@@ -726,6 +724,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     return true;
   }
 
+  // Reads into buffer_ the next n bytes from file_ starting at offset.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
Status ReadIntoBuffer(uint64_t offset, size_t n) const {
     if (n > buffer_.Capacity()) {
       n = buffer_.Capacity();
     }
@@ -742,14 +743,171 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     return s;
   }
 
-  std::unique_ptr file_;
+  const std::unique_ptr file_;
   const size_t alignment_;
-  size_t readahead_size_;
+  const size_t readahead_size_;
 
   mutable std::mutex lock_;
+  // The buffer storing the prefetched data
   mutable AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
   mutable uint64_t buffer_offset_;
 };
+
+// This class wraps a SequentialFile, exposing the same API, with the difference
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public SequentialFile {
+ public:
+  ReadaheadSequentialFile(std::unique_ptr&& file,
+                          size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0),
+        read_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+  ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    std::unique_lock lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return Status::OK();
+    }
+    n -= cached_len;
+
+    Status s;
+    // Read-ahead only makes sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      s = file_->Read(n, result, scratch + cached_len);
+      if (s.ok()) {
+        read_offset_ += result->size();
+        *result = Slice(scratch, cached_len + result->size());
+      }
+      buffer_.Clear();
+      return s;
+    }
+
+    s = ReadIntoBuffer(readahead_size_);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(n, &remaining_len, scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  Status Skip(uint64_t n) override {
+    std::unique_lock lk(lock_);
+    Status s = Status::OK();
+    // First check if we need to skip already cached data
+    if (buffer_.CurrentSize() > 0) {
+      // Do we need to skip beyond cached data?
+      if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whatever is in memory and adjust offset accordingly
+        n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+        read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+      } else {
+        // No. The entire section to be skipped is entirely in cache.
+        read_offset_ += n;
+        n = 0;
+      }
+    }
+    if (n > 0) {
+      // We still need to skip more, so call the file API for skipping
+      s = file_->Skip(n);
+      if (s.ok()) {
+        read_offset_ += n;
+      }
+      buffer_.Clear();
+    }
+    return s;
+  }
+
+  Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+                        char* scratch) override {
+    return file_->PositionedRead(offset, n, result, scratch);
+  }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes. If anything was read from the cache, it
+  // sets cached_len to the number of bytes actually read, copies that number
+  // of bytes to scratch and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
+  bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+    if (read_offset_ < buffer_offset_ ||
+        read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    read_offset_ += *cached_len;
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
+  Status ReadIntoBuffer(size_t n) {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    Status s = file_->Read(n, &result, buffer_.BufferStart());
+    if (s.ok()) {
+      buffer_offset_ = read_offset_;
+      buffer_.Size(result.size());
+      assert(buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  std::mutex lock_;
+  // The buffer storing the prefetched data
+  AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  uint64_t buffer_offset_;
+  // The offset up to which data was read from file_. In fact, it can be larger
+  // than the actual file size, since the file_->Skip(n) call doesn't return the
+  // actual number of bytes that were skipped, which can be less than n.
+  // This is not a problem since read_offset_ is monotonically increasing and
+  // its only use is to figure out if the next piece of data should be read from
+  // buffer_ or file_ directly.
+ uint64_t read_offset_; +}; } // namespace Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, @@ -866,6 +1024,14 @@ std::unique_ptr NewReadaheadRandomAccessFile( return result; } +std::unique_ptr +SequentialFileReader::NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( + new ReadaheadSequentialFile(std::move(file), readahead_size)); + return result; +} + Status NewWritableFile(Env* env, const std::string& fname, std::unique_ptr* result, const EnvOptions& options) { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 3052ca8f4e0..a93274644c4 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -43,12 +43,18 @@ class SequentialFileReader { private: std::unique_ptr file_; std::string file_name_; - std::atomic offset_; // read offset + std::atomic offset_{0}; // read offset public: explicit SequentialFileReader(std::unique_ptr&& _file, const std::string& _file_name) - : file_(std::move(_file)), file_name_(_file_name), offset_(0) {} + : file_(std::move(_file)), file_name_(_file_name) {} + + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name, + size_t _readahead_size) + : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), + file_name_(_file_name) {} SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); @@ -66,13 +72,17 @@ class SequentialFileReader { Status Skip(uint64_t n); - void Rewind(); - SequentialFile* file() { return file_.get(); } std::string file_name() { return file_name_; } bool use_direct_io() const { return file_->use_direct_io(); } + + private: + // NewReadaheadSequentialFile provides a wrapper over SequentialFile to + // always prefetch additional data with every read. + static std::unique_ptr NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size); }; // RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. 
It is diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index a4a9458d642..aa74303b8fc 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -275,7 +275,7 @@ TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSizeTest) { } TEST_P(ReadaheadRandomAccessFileTest, - SourceStrLenCanBeGreaterThanReadaheadSizeTest) { + SourceStrLenGreaterThanReadaheadSizeTest) { Random rng(42); for (int k = 0; k < 100; ++k) { size_t strLen = k * GetReadaheadSize() + @@ -286,13 +286,13 @@ TEST_P(ReadaheadRandomAccessFileTest, for (int test = 1; test <= 100; ++test) { size_t offset = rng.Uniform(static_cast(strLen)); size_t n = rng.Uniform(static_cast(GetReadaheadSize())); - ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(offset, n)); } } } -TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { +TEST_P(ReadaheadRandomAccessFileTest, ReadExceedsReadaheadSizeTest) { Random rng(7); size_t strLen = 4 * GetReadaheadSize() + rng.Uniform(static_cast(GetReadaheadSize())); @@ -303,7 +303,7 @@ TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { size_t offset = rng.Uniform(static_cast(strLen)); size_t n = GetReadaheadSize() + rng.Uniform(static_cast(GetReadaheadSize())); - ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(offset, n)); } } @@ -315,13 +315,118 @@ INSTANTIATE_TEST_CASE_P( SourceStrLenLessThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); INSTANTIATE_TEST_CASE_P( - SourceStrLenCanBeGreaterThanReadaheadSizeTest, - ReadaheadRandomAccessFileTest, + SourceStrLenGreaterThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); INSTANTIATE_TEST_CASE_P( - NExceedReadaheadTest, ReadaheadRandomAccessFileTest, + ReadExceedsReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +class ReadaheadSequentialFileTest : public testing::Test, + public testing::WithParamInterface { + public: + static std::vector GetReadaheadSizeList() { + return {1lu << 12, 1lu << 16}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadSequentialFileTest() {} + std::string Read(size_t n) { + Slice result; + test_read_holder_->Read(n, &result, scratch_.get()); + return std::string(result.data(), result.size()); + } + void Skip(size_t n) { test_read_holder_->Skip(n); } + void ResetSourceStr(const std::string& str = "") { + auto read_holder = + std::unique_ptr(new test::SeqStringSource(str)); + test_read_holder_.reset(new SequentialFileReader(std::move(read_holder), + "test", readahead_size_)); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + std::unique_ptr test_read_holder_; + std::unique_ptr scratch_; +}; + +TEST_P(ReadaheadSequentialFileTest, EmptySourceStrTest) { + ASSERT_EQ("", Read(0)); + ASSERT_EQ("", Read(1)); + ASSERT_EQ("", Read(13)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenLessThanReadaheadSizeTest) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(0, 3), Read(3)); + ASSERT_EQ(str.substr(3, 1), Read(1)); + ASSERT_EQ(str.substr(4), Read(str.size())); + 
ASSERT_EQ("", Read(100)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenGreaterThanReadaheadSizeTest) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +TEST_P(ReadaheadSequentialFileTest, ReadExceedsReadaheadSizeTest) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStrTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenGreaterThanReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + ReadExceedsReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); } // namespace rocksdb int main(int argc, char** argv) { From 74fb7f0ba53ecce443ff0a619199c0e2cb74ab35 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Tue, 16 Jul 2019 19:13:35 -0700 Subject: [PATCH 228/572] Cleaned up and simplified LRU cache implementation (#5579) Summary: The 'refs' field in LRUHandle now counts only external references, since anyway we already have the IN_CACHE flag. This simplifies reference accounting logic a bit. Also cleaned up few asserts code as well as the comments - to be more readable. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5579 Differential Revision: D16286747 Pulled By: elipoz fbshipit-source-id: 7186d88f80f512ce584d0a303437494b5cbefd7f --- .gitignore | 1 + cache/cache_test.cc | 4 +- cache/lru_cache.cc | 124 ++++++++++++++++++++------------------------ cache/lru_cache.h | 78 +++++++++++++++------------- 4 files changed, 100 insertions(+), 107 deletions(-) diff --git a/.gitignore b/.gitignore index 6364dfdc401..180fb4c5007 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ ldb manifest_dump sst_dump blob_dump +block_cache_trace_analyzer column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 46ce78db68f..b728c67c7d7 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -562,6 +562,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } + ASSERT_EQ(10, cache->GetUsage()); // test2: set the flag to true. Insert and check if it fails. std::string extra_key = "extra"; @@ -571,6 +572,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); ASSERT_TRUE(s.IsIncomplete()); ASSERT_EQ(nullptr, handle); + ASSERT_EQ(10, cache->GetUsage()); for (size_t i = 0; i < 10; i++) { cache->Release(handles[i]); @@ -591,7 +593,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache2->Insert(extra_key, extra_value, 1, &deleter); // AS if the key have been inserted into cache but get evicted immediately. ASSERT_OK(s); - ASSERT_EQ(5, cache->GetUsage()); + ASSERT_EQ(5, cache2->GetUsage()); ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); for (size_t i = 0; i < 5; i++) { diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 676bed3051c..7c04cb909d5 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -24,7 +24,7 @@ LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { LRUHandleTable::~LRUHandleTable() { ApplyToAllCacheEntries([](LRUHandle* h) { - if (h->refs == 1) { + if (!h->HasRefs()) { h->Free(); } }); @@ -113,29 +113,17 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, SetCapacity(capacity); } -LRUCacheShard::~LRUCacheShard() {} - -bool LRUCacheShard::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - return e->refs == 0; -} - -// Call deleter and free - void LRUCacheShard::EraseUnRefEntries() { autovector last_reference_list; { MutexLock l(&mutex_); while (lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == - 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); usage_ -= old->charge; last_reference_list.push_back(old); } @@ -148,22 +136,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { + const auto applyCallback = [&]() { + table_.ApplyToAllCacheEntries( + [callback](LRUHandle* h) { callback(h->value, h->charge); }); + }; + if (thread_safe) { - mutex_.Lock(); - } - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - if (thread_safe) { - mutex_.Unlock(); + MutexLock l(&mutex_); + applyCallback(); + } else { + applyCallback(); } } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { + MutexLock 
l(&mutex_); *lru = &lru_; *lru_low_pri = lru_low_pri_; } size_t LRUCacheShard::TEST_GetLRUSize() { + MutexLock l(&mutex_); LRUHandle* lru_handle = lru_.next; size_t lru_size = 0; while (lru_handle != &lru_) { @@ -231,14 +224,13 @@ void LRUCacheShard::MaintainPoolSize() { void LRUCacheShard::EvictFromLRU(size_t charge, autovector* deleted) { - while (usage_ + charge > capacity_ && lru_.next != &lru_) { + while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); usage_ -= old->charge; deleted->push_back(old); } @@ -252,8 +244,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) { high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; EvictFromLRU(0, &last_reference_list); } - // we free the entries here outside of mutex for - // performance reasons + + // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -269,22 +261,22 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { assert(e->InCache()); - if (e->refs == 1) { + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); } - e->refs++; + e->Ref(); e->SetHit(); } return reinterpret_cast(e); } bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* handle = reinterpret_cast(h); + LRUHandle* e = reinterpret_cast(h); MutexLock l(&mutex_); - if (handle->InCache() && handle->refs == 1) { - LRU_Remove(handle); - } - handle->refs++; + // To create another reference - entry must be already externally referenced + assert(e->HasRefs()); + e->Ref(); return true; } @@ -303,30 +295,27 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { bool last_reference = false; { MutexLock l(&mutex_); - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (e->refs == 1 && e->InCache()) { + last_reference = e->Unref(); + if (last_reference && e->InCache()) { // The item is still in cache, and nobody else holds a reference to it if (usage_ > capacity_ || force_erase) { - // the cache is full // The LRU list must be empty since the cache is full - assert(!(usage_ > capacity_) || lru_.next == &lru_); - // take this opportunity and remove the item + assert(lru_.next == &lru_ || force_erase); + // Take this opportunity and remove the item table_.Remove(e->key(), e->hash); e->SetInCache(false); - Unref(e); - usage_ -= e->charge; - last_reference = true; } else { - // put the item on the list to be potentially freed + // Put the item back on the LRU list, and don't free it LRU_Insert(e); + last_reference = false; } } + if (last_reference) { + usage_ -= e->charge; + } } - // free outside of mutex + // Free the entry here outside of mutex for performance reasons if (last_reference) { e->Free(); } @@ -342,7 +331,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // It shouldn't happen very often though. 
LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s; + Status s = Status::OK(); autovector last_reference_list; e->value = value; @@ -351,9 +340,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->key_length = key.size(); e->flags = 0; e->hash = hash; - e->refs = (handle == nullptr - ? 1 - : 2); // One from LRUCache, one for the returned handle + e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); @@ -366,11 +353,12 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // is freed or the lru list is empty EvictFromLRU(charge, &last_reference_list); - if (usage_ - lru_usage_ + charge > capacity_ && + if ((usage_ + charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) { if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted // into cache and get evicted immediately. + e->SetInCache(false); last_reference_list.push_back(e); } else { delete[] reinterpret_cast(e); @@ -378,32 +366,30 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, s = Status::Incomplete("Insert failed due to LRU cache being full."); } } else { - // insert into the cache - // note that the cache might get larger than its capacity if not enough - // space was freed + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. LRUHandle* old = table_.Insert(e); usage_ += e->charge; if (old != nullptr) { + assert(old->InCache()); old->SetInCache(false); - if (Unref(old)) { - usage_ -= old->charge; - // old is on LRU because it's in cache and its reference count - // was just 1 (Unref returned 0) + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 LRU_Remove(old); + usage_ -= old->charge; last_reference_list.push_back(old); } } if (handle == nullptr) { LRU_Insert(e); } else { + e->Ref(); *handle = reinterpret_cast(e); } - s = Status::OK(); } } - // we free the entries here outside of mutex for - // performance reasons + // Free the entries here outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -418,18 +404,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (last_reference && e->InCache()) { + assert(e->InCache()); + e->SetInCache(false); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); + usage_ -= e->charge; + last_reference = true; } - e->SetInCache(false); } } - // mutex not held here + // Free the entry here outside of mutex for performance reasons // last_reference will only be true if e != nullptr if (last_reference) { e->Free(); diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 0d9a317486e..1ff765d1592 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -17,31 +17,34 @@ namespace rocksdb { -// LRU cache implementation +// LRU cache implementation. This class is not thread-safe. // An entry is a variable length heap-allocated structure. // Entries are referenced by cache and/or by any external entity. -// The cache keeps all its entries in table. Some elements +// The cache keeps all its entries in a hash table. Some elements // are also stored on LRU list. // // LRUHandle can be in these states: // 1. 
Referenced externally AND in hash table.
-// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
-// 2. Not referenced externally and in hash table. In that case the entry is
-// in the LRU and can be freed. (refs == 1 && in_cache == true)
-// 3. Referenced externally and not in hash table. In that case the entry is
-// in not on LRU and not in table. (refs >= 1 && in_cache == false)
+//    In that case the entry is *not* in the LRU list
+//    (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+//    In that case the entry is in the LRU list and can be freed.
+//    (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+//    In that case the entry is not in the LRU list and not in hash table.
+//    The entry can be freed when refs becomes 0.
+//    (refs >= 1 && in_cache == false)
 //
 // All newly created LRUHandles are in state 1. If you call
-// LRUCacheShard::Release
-// on entry in state 1, it will go into state 2. To move from state 1 to
-// state 3, either call LRUCacheShard::Erase or LRUCacheShard::Insert with the
-// same key.
+// LRUCacheShard::Release on entry in state 1, it will go into state 2.
+// To move from state 1 to state 3, either call LRUCacheShard::Erase or
+// LRUCacheShard::Insert with the same key (but possibly different value).
 // To move from state 2 to state 1, use LRUCacheShard::Lookup.
 // Before destruction, make sure that no handles are in state 1. This means
 // that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a
-// matching
-// RUCache::Release (to move into state 2) or LRUCacheShard::Erase (for state 3)
+// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase
+// (to move into state 3).
 
 struct LRUHandle {
   void* value;
@@ -51,37 +54,42 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;  // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  uint32_t refs;     // a number of refs to this entry
-                     // cache itself is counted as 1
-
-  // Include the following flags:
-  //   IN_CACHE:         whether this entry is referenced by the hash table.
-  //   IS_HIGH_PRI:      whether this entry is high priority entry.
-  //   IN_HIGH_PRI_POOL: whether this entry is in high-pri pool.
-  //   HAS_HIT:          whether this entry has had any lookups (hits).
+  // The hash of key(). Used for fast sharding and comparisons.
+  uint32_t hash;
+  // The number of external refs to this entry. The cache itself is not counted.
+  uint32_t refs;
+
   enum Flags : uint8_t {
+    // Whether this entry is referenced by the hash table.
     IN_CACHE = (1 << 0),
+    // Whether this entry is a high priority entry.
     IS_HIGH_PRI = (1 << 1),
+    // Whether this entry is in high-pri pool.
     IN_HIGH_PRI_POOL = (1 << 2),
+    // Whether this entry has had any lookups (hits).
     HAS_HIT = (1 << 3),
   };
 
   uint8_t flags;
-  uint32_t hash;  // Hash of key(); used for fast sharding and comparisons
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
 
-  char key_data[1];  // Beginning of key
+  Slice key() const { return Slice(key_data, key_length); }
 
-  Slice key() const {
-    // For cheaper lookups, we allow a temporary Handle object
-    // to store a pointer to a key in "value".
-    if (next == this) {
-      return *(reinterpret_cast(value));
-    } else {
-      return Slice(key_data, key_length);
-    }
+  // Increase the reference count by 1.
+  void Ref() { refs++; }
+
+  // Just reduce the reference count by 1. Return true if it was the last reference.
+ bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; } + // Return true if there are external refs, false otherwise. + bool HasRefs() const { return refs > 0; } + bool InCache() const { return flags & IN_CACHE; } bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } @@ -114,7 +122,7 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } void Free() { - assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); + assert(refs == 0); if (deleter) { (*deleter)(key(), value); } @@ -169,7 +177,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex); - virtual ~LRUCacheShard(); + virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to @@ -225,10 +233,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // high-pri pool is no larger than the size specify by high_pri_pool_pct. void MaintainPoolSize(); - // Just reduce the reference count by 1. - // Return true if last reference - bool Unref(LRUHandle* e); - // Free some space following strict LRU policy until enough space // to hold (usage_ + charge) is freed or the lru list is empty // This function is not thread safe - it needs to be executed while From a3c1832e862ab4b76ccf1299d6e95b15eb50730e Mon Sep 17 00:00:00 2001 From: Yuqi Gu Date: Wed, 17 Jul 2019 11:19:06 -0700 Subject: [PATCH 229/572] Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. 
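For reviewers without Arm hardware, the accelerated path can be sanity-checked against a portable reference: any CRC32C implementation must reproduce the standard check value crc32c("123456789") == 0xE3069283. The sketch below is illustrative only and is not part of this patch; `Crc32cReference` is a hypothetical name:
```
// Portable bit-by-bit CRC-32C (Castagnoli, reflected polynomial 0x82F63B78).
// Slow, but useful as a reference that any accelerated path must agree with.
#include <cassert>
#include <cstdint>

uint32_t Crc32cReference(uint32_t crc, const unsigned char* data,
                         unsigned len) {
  crc ^= 0xffffffff;
  for (unsigned i = 0; i < len; i++) {
    crc ^= data[i];
    for (int k = 0; k < 8; k++) {
      // Conditionally XOR in the polynomial when the low bit is set.
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
  }
  return crc ^ 0xffffffff;
}

int main() {
  const unsigned char buf[] = "123456789";
  assert(Crc32cReference(0, buf, 9) == 0xE3069283u);
  return 0;
}
```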
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494

Differential Revision: D16340806

fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945
---
 Makefile             |  4 +--
 util/crc32c_arm64.cc | 83 ++++++++++++++++++++++++++++++++++----------
 util/crc32c_arm64.h  | 15 ++++++--
 3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 1828b833b02..100f160ca31 100644
--- a/Makefile
+++ b/Makefile
@@ -144,8 +144,8 @@ HAVE_POWER8=1
 endif

 ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
-CXXFLAGS += -march=armv8-a+crc
-CFLAGS += -march=armv8-a+crc
+CXXFLAGS += -march=armv8-a+crc+crypto
+CFLAGS += -march=armv8-a+crc+crypto
 ARMCRC_SOURCE=1
 endif

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 62fabe99e3c..8743f8c721c 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -19,35 +19,82 @@ uint32_t crc32c_runtime_check(void) {

 uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                       unsigned len) {
-  const uint8_t *buf1;
-  const uint16_t *buf2;
-  const uint32_t *buf4;
-  const uint64_t *buf8;
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;

+#ifdef HAVE_ARM64_CRYPTO
+  /* Crc32c Parallel computation
+   * Algorithm comes from Intel whitepaper:
+   * crc-iscsi-polynomial-crc32-instruction-paper
+   *
+   * Input data is divided into three equal-sized blocks
+   * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+   * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+   */
+  #define BLK_LENGTH 42
+  while (length >= 1024) {
+    uint64_t t0, t1;
+    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

-  int64_t length = (int64_t)len;
-  crc ^= 0xffffffff;
-  buf8 = (const uint64_t *)data;
-  while ((length -= sizeof(uint64_t)) >= 0) {
-    crc = __crc32cd(crc, *buf8++);
+    /* Parallel Param:
+     * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+     * k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+     */
+    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+    /* First 8 bytes for better pipelining */
+    crc0 = crc32c_u64(crc, *buf64++);
+
+    /* 3 blocks crc32c parallel computation
+     *
+     * 42 * 8 * 3 = 1008 (bytes)
+     */
+    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+      crc0 = crc32c_u64(crc0, *buf64);
+      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
+      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+    }
+    buf64 += (BLK_LENGTH * 2);
+
+    /* Last 8 bytes */
+    crc = crc32c_u64(crc2, *buf64++);
+
+    t0 = (uint64_t)vmull_p64(crc0, k0);
+    t1 = (uint64_t)vmull_p64(crc1, k1);
+
+    /* Merge (crc0, crc1, crc2) -> crc */
+    crc1 = crc32c_u64(0, t1);
+    crc ^= crc1;
+    crc0 = crc32c_u64(0, t0);
+    crc ^= crc0;
+
+    length -= 1024;
+  }
+#endif
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t*)buf8);
+    buf8 += 8;
+    length -= 8;
   }

   /* The following is more efficient than the straight loop */
-  buf4 = (const uint32_t *)buf8;
-  if (length & sizeof(uint32_t)) {
-    crc = __crc32cw(crc, *buf4++);
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t*)buf8);
+    buf8 += 4;
     length -= 4;
   }

-  buf2 = (const uint16_t *)buf4;
-  if (length & sizeof(uint16_t)) {
-    crc = __crc32ch(crc, *buf2++);
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t*)buf8);
+    buf8 += 2;
     length -= 2;
   }

-  buf1 = (const uint8_t *)buf2;
-  if (length & sizeof(uint8_t))
-    crc = __crc32cb(crc, *buf1);
+  if (length >= 1)
+    crc = crc32c_u8(crc, *buf8);

   crc ^= 0xffffffff;
   return crc;

diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 80b3aca361a..fb727ce4020 100644
---
a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -9,13 +9,24 @@ #include #if defined(__aarch64__) || defined(__AARCH64__) + #ifdef __ARM_FEATURE_CRC32 #define HAVE_ARM64_CRC #include +#define crc32c_u8(crc, v) __crc32cb(crc, v) +#define crc32c_u16(crc, v) __crc32ch(crc, v) +#define crc32c_u32(crc, v) __crc32cw(crc, v) +#define crc32c_u64(crc, v) __crc32cd(crc, v) + extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len); extern uint32_t crc32c_runtime_check(void); -#endif -#endif +#ifdef __ARM_FEATURE_CRYPTO +#define HAVE_ARM64_CRYPTO +#include +#endif // __ARM_FEATURE_CRYPTO +#endif // __ARM_FEATURE_CRC32 + +#endif // defined(__aarch64__) || defined(__AARCH64__) #endif From 22ce4624509694b8c35a15ef1fc49d3013f05a96 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Wed, 17 Jul 2019 12:22:21 -0700 Subject: [PATCH 230/572] Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. 
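// Illustrative call sequence (a sketch, not code from this change; `db`,
// `options` and `metadata` are assumed to exist, with `metadata` produced by
// a prior ExportColumnFamily() call):
//   ColumnFamilyHandle* handle = nullptr;
//   ImportColumnFamilyOptions import_opts;  // copies files by default
//   Status s = db->CreateColumnFamilyWithImport(
//       options, "new_cf", import_opts, *metadata, &handle);
//   if (s.ok()) { /* use handle like any other column family */ }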
virtual Status CreateColumnFamilyWithImport(
      const ColumnFamilyOptions& options, const std::string& column_family_name,
      const ImportColumnFamilyOptions& import_options,
      const ExportImportFilesMetaData& metadata,
      ColumnFamilyHandle** handle);

Internally, this API creates a new CF, parses all the sst files and adds them
to the specified column family, at the same level and with the same sequence
numbers as in the metadata. Also performs safety checks with respect to
overlaps between the sst files being imported.

If the incoming sequence number is higher than the current local sequence
number, the local sequence number is updated to reflect this.

Note that as the sst files are being moved across column families, the column
family name recorded in each sst file will no longer match the actual column
family on the destination DB. The API does not modify the column family name
or id in the sst files being imported.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495

Differential Revision: D16018881

fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
---
 CMakeLists.txt                           |   1 +
 Makefile                                 |   5 +
 TARGETS                                  |   1 +
 db/compacted_db_impl.h                   |   9 +
 db/db_impl/db_impl.cc                    | 122 +++++
 db/db_impl/db_impl.h                     |  11 +-
 db/db_impl/db_impl_readonly.h            |  10 +
 db/db_test.cc                            |  10 +
 db/import_column_family_job.cc           | 257 +++++++++++
 db/import_column_family_job.h            |  70 +++
 db/import_column_family_test.cc          | 565 +++++++++++++++++++++++
 include/rocksdb/db.h                     |  21 +
 include/rocksdb/metadata.h               |   7 +
 include/rocksdb/options.h                |   6 +
 include/rocksdb/utilities/checkpoint.h   |  14 +
 include/rocksdb/utilities/stackable_db.h |  10 +
 src.mk                                   |   1 +
 utilities/checkpoint/checkpoint_impl.cc  | 185 ++++++++
 utilities/checkpoint/checkpoint_impl.h   |  23 +
 utilities/checkpoint/checkpoint_test.cc  | 126 +++++
 20 files changed, 1453 insertions(+), 1 deletion(-)
 create mode 100644 db/import_column_family_job.cc
 create mode 100644 db/import_column_family_job.h
 create mode 100644 db/import_column_family_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65904b8cae6..b49a13572bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -520,6 +520,7 @@ set(SOURCES
         db/flush_job.cc
         db/flush_scheduler.cc
         db/forward_iterator.cc
+        db/import_column_family_job.cc
         db/internal_stats.cc
         db/logs_with_prep_tracker.cc
         db/log_reader.cc
diff --git a/Makefile b/Makefile
index 100f160ca31..f8a904bd39d 100644
--- a/Makefile
+++ b/Makefile
@@ -500,6 +500,7 @@ TESTS = \
	plain_table_db_test \
	comparator_db_test \
	external_sst_file_test \
+	import_column_family_test \
	prefix_test \
	skiplist_test \
	write_buffer_manager_test \
@@ -577,6 +578,7 @@ PARALLEL_TEST = \
	db_universal_compaction_test \
	db_wal_test \
	external_sst_file_test \
+	import_column_family_test \
	fault_injection_test \
	inlineskiplist_test \
	manual_compaction_test \
@@ -1274,6 +1276,9 @@ external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util.
external_sst_file_test: db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

+import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
 db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

diff --git a/TARGETS b/TARGETS
index eda1051396d..cfd9ef73d40 100644
--- a/TARGETS
+++ b/TARGETS
@@ -113,6 +113,7 @@ cpp_library(
         "db/flush_job.cc",
         "db/flush_scheduler.cc",
         "db/forward_iterator.cc",
+        "db/import_column_family_job.cc",
         "db/internal_stats.cc",
         "db/log_reader.cc",
         "db/log_writer.cc",
diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h
index c1b8da9a782..e71ce249411 100644
--- a/db/compacted_db_impl.h
+++ b/db/compacted_db_impl.h
@@ -85,6 +85,15 @@ class CompactedDBImpl : public DBImpl {
       const IngestExternalFileOptions& /*ingestion_options*/) override {
     return Status::NotSupported("Not supported in compacted db mode.");
   }
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }

  private:
   friend class DB;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 6f2ebdc8098..af9aea011a3 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -33,6 +33,7 @@
 #include "db/error_handler.h"
 #include "db/event_helpers.h"
 #include "db/external_sst_file_ingestion_job.h"
+#include "db/import_column_family_job.h"
 #include "db/flush_job.h"
 #include "db/forward_iterator.h"
 #include "db/job_context.h"
@@ -3894,6 +3895,127 @@ Status DBImpl::IngestExternalFiles(
   return status;
 }

+Status DBImpl::CreateColumnFamilyWithImport(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    const ImportColumnFamilyOptions& import_options,
+    const ExportImportFilesMetaData& metadata,
+    ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  assert(*handle == nullptr);
+  std::string cf_comparator_name = options.comparator->Name();
+  if (cf_comparator_name != metadata.db_comparator_name) {
+    return Status::InvalidArgument("Comparator name mismatch");
+  }
+
+  // Create column family.
+  auto status = CreateColumnFamily(options, column_family_name, handle);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Import sst files from metadata.
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+  auto cfd = cfh->cfd();
+  ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+                                   immutable_db_options_, env_options_,
+                                   import_options, metadata.files);
+
+  SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+  VersionEdit dummy_edit;
+  uint64_t next_file_number = 0;
+  std::list<uint64_t>::iterator pending_output_elem;
+  {
+    // Lock db mutex
+    InstrumentedMutexLock l(&mutex_);
+    if (error_handler_.IsDBStopped()) {
+      // Don't import files when there is a bg_error
+      status = error_handler_.GetBGError();
+    }
+
+    // Make sure that bg cleanup won't delete the files that we are importing
+    pending_output_elem = CaptureCurrentFileNumberInPendingOutputs();
+
+    if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse the file number that has already been assigned to
+      // the internal file, and this will overwrite the external file.
+      // To protect the external file, we have to make sure the file number
+      // will never be reused.
+      next_file_number =
+          versions_->FetchAddFileNumber(metadata.files.size());
+      auto cf_options = cfd->GetLatestMutableCFOptions();
+      status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                                      directories_.GetDbDir());
+      if (status.ok()) {
+        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+      }
+    }
+  }
+  dummy_sv_ctx.Clean();
+
+  if (status.ok()) {
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
+    status = import_job.Prepare(next_file_number, sv);
+    CleanupSuperVersion(sv);
+  }
+
+  if (status.ok()) {
+    SuperVersionContext sv_context(true /*create_superversion*/);
+    {
+      // Lock db mutex
+      InstrumentedMutexLock l(&mutex_);
+
+      // Stop writes to the DB by entering both write threads
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WriteThread::Writer nonmem_w;
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+
+      num_running_ingest_file_++;
+      assert(!cfd->IsDropped());
+      status = import_job.Run();
+
+      // Install job edit [Mutex will be unlocked here]
+      if (status.ok()) {
+        auto cf_options = cfd->GetLatestMutableCFOptions();
+        status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+                                        &mutex_, directories_.GetDbDir());
+        if (status.ok()) {
+          InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+        }
+      }
+
+      // Resume writes to the DB
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+      write_thread_.ExitUnbatched(&w);
+
+      num_running_ingest_file_--;
+      if (num_running_ingest_file_ == 0) {
+        bg_cv_.SignalAll();
+      }
+    }
+    // mutex_ is unlocked here
+
+    sv_context.Clean();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+  }
+
+  import_job.Cleanup(status);
+  if (!status.ok()) {
+    DropColumnFamily(*handle);
+    DestroyColumnFamilyHandle(*handle);
+    *handle = nullptr;
+  }
+  return status;
+}
+
 Status DBImpl::VerifyChecksum() {
   Status s;
   std::vector<ColumnFamilyData*> cfd_list;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index d417035b1ef..547e3e1d6be 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -27,6 +27,7 @@
 #include "db/external_sst_file_ingestion_job.h"
 #include "db/flush_job.h"
 #include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
 #include "db/internal_stats.h"
 #include "db/log_writer.h"
 #include "db/logs_with_prep_tracker.h"
@@ -356,6 +357,13 @@ class DBImpl : public DB {
   virtual Status IngestExternalFiles(
       const std::vector<IngestExternalFileArg>& args) override;

+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override;
+
   virtual Status VerifyChecksum() override;

   using DB::StartTrace;
@@ -1803,7 +1811,8 @@ class DBImpl : public DB {

   std::string db_absolute_path_;

-  // Number of running IngestExternalFile() calls.
+  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+  // calls.
// REQUIRES: mutex held int num_running_ingest_file_; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 18df900cba0..ad307677ccc 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -115,6 +115,16 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + private: friend class DB; diff --git a/db/db_test.cc b/db/db_test.cc index 69e91923cd6..36bdda59e21 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2492,6 +2492,16 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented"); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not implemented."); + } + Status VerifyChecksum() override { return Status::NotSupported("Not implemented."); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc new file mode 100644 index 00000000000..3c00a25917d --- /dev/null +++ b/db/import_column_family_job.cc @@ -0,0 +1,257 @@ +#ifndef ROCKSDB_LITE + +#include "db/import_column_family_job.h" + +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "file/file_util.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "util/file_reader_writer.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are importing + for (const auto& file_metadata : metadata_) { + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, &file_to_import, sv); + if (!status.ok()) { + return status; + } + files_to_import_.push_back(file_to_import); + } + + const auto ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_import_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges in any particular + // level. + int min_level = 1; // Check for overlaps in Level 1 and above. 
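+    // (Level 0 is exempt from this check because L0 files may legitimately
+    // overlap one another: e.g. two L0 files covering [a, c] and [b, d] are
+    // valid, while the same pair within any level >= 1 must be rejected.)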
+    int max_level = -1;
+    for (const auto& file_metadata : metadata_) {
+      if (file_metadata.level > max_level) {
+        max_level = file_metadata.level;
+      }
+    }
+    for (int level = min_level; level <= max_level; ++level) {
+      autovector<const IngestedFileInfo*> sorted_files;
+      for (size_t i = 0; i < num_files; i++) {
+        if (metadata_[i].level == level) {
+          sorted_files.push_back(&files_to_import_[i]);
+        }
+      }
+
+      std::sort(sorted_files.begin(), sorted_files.end(),
+                [&ucmp](const IngestedFileInfo* info1,
+                        const IngestedFileInfo* info2) {
+                  return ucmp->Compare(info1->smallest_user_key,
+                                       info2->smallest_user_key) < 0;
+                });
+
+      for (size_t i = 0; i < sorted_files.size() - 1; i++) {
+        if (ucmp->Compare(sorted_files[i]->largest_user_key,
+                          sorted_files[i + 1]->smallest_user_key) >= 0) {
+          return Status::InvalidArgument("Files have overlapping ranges");
+        }
+      }
+    }
+  }
+
+  for (const auto& f : files_to_import_) {
+    if (f.num_entries == 0) {
+      return Status::InvalidArgument("File contains no entries");
+    }
+
+    if (!f.smallest_internal_key().Valid() ||
+        !f.largest_internal_key().Valid()) {
+      return Status::Corruption("File has corrupted keys");
+    }
+  }
+
+  // Copy/Move external files into DB
+  auto hardlink_files = import_options_.move_files;
+  for (auto& f : files_to_import_) {
+    f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+
+    const auto path_outside_db = f.external_file_path;
+    const auto path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+    if (hardlink_files) {
+      status = env_->LinkFile(path_outside_db, path_inside_db);
+      if (status.IsNotSupported()) {
+        // Original file is on a different FS, use copy instead of hard linking
+        hardlink_files = false;
+      }
+    }
+    if (!hardlink_files) {
+      status = CopyFile(env_, path_outside_db, path_inside_db, 0,
+                        db_options_.use_fsync);
+    }
+    if (!status.ok()) {
+      break;
+    }
+    f.copy_file = !hardlink_files;
+    f.internal_file_path = path_inside_db;
+  }
+
+  if (!status.ok()) {
+    // We failed, remove all files that we copied into the db
+    for (const auto& f : files_to_import_) {
+      if (f.internal_file_path.empty()) {
+        break;
+      }
+      const auto s = env_->DeleteFile(f.internal_file_path);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  Status status;
+  edit_.SetColumnFamily(cfd_->GetID());
+
+  for (size_t i = 0; i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+    edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+                  f.fd.GetFileSize(), f.smallest_internal_key(),
+                  f.largest_internal_key(), file_metadata.smallest_seqno,
+                  file_metadata.largest_seqno, false);
+
+    // If the incoming sequence number is higher, update the local sequence
+    // number.
+    if (file_metadata.largest_seqno > versions_->LastSequence()) {
+      versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+      versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+      versions_->SetLastSequence(file_metadata.largest_seqno);
+    }
+  }
+
+  return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add the files to the database, so remove all the files we
+    // copied.
+ for (const auto& f : files_to_import_) { + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } else if (status.ok() && import_options_.move_files) { + // The files were moved and added successfully, remove original file links + for (IngestedFileInfo& f : files_to_import_) { + const auto s = env_->DeleteFile(f.external_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ImportColumnFamilyJob::GetIngestedFileInfo( + const std::string& external_file, IngestedFileInfo* file_to_import, + SuperVersion* sv) { + file_to_import->external_file_path = external_file; + + // Get external file size + auto status = env_->GetFileSize(external_file, &file_to_import->file_size); + if (!status.ok()) { + return status; + } + + // Create TableReader for external file + std::unique_ptr table_reader; + std::unique_ptr sst_file; + std::unique_ptr sst_file_reader; + + status = env_->NewRandomAccessFile(external_file, &sst_file, env_options_); + if (!status.ok()) { + return status; + } + sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file), + external_file)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions(*cfd_->ioptions(), + sv->mutable_cf_options.prefix_extractor.get(), + env_options_, cfd_->internal_comparator()), + std::move(sst_file_reader), file_to_import->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + + // Set original_seqno to 0. + file_to_import->original_seqno = 0; + + // Get number of entries in table + file_to_import->num_entries = props->num_entries; + + ParsedInternalKey key; + ReadOptions ro; + // During reading the external file we can cache blocks that we read into + // the block cache, if we later change the global seqno of this file, we will + // have block in cache that will include keys with wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache. 
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file has corrupted keys");
+  }
+  file_to_import->smallest_user_key = key.user_key.ToString();
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file has corrupted keys");
+  }
+  file_to_import->largest_user_key = key.user_key.ToString();
+
+  file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_import->table_properties = *props;
+
+  return status;
+}
+
+}  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h
new file mode 100644
index 00000000000..5b8577df1d5
--- /dev/null
+++ b/db/import_column_family_job.h
@@ -0,0 +1,70 @@
+#pragma once
+#include
+#include
+#include
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(
+      Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+      const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+      const ImportColumnFamilyOptions& import_options,
+      const std::vector<LiveFileMetaData>& metadata)
+      : env_(env),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file, + IngestedFileInfo* file_to_import, + SuperVersion* sv); + + Env* env_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const EnvOptions& env_options_; + autovector files_to_import_; + VersionEdit edit_; + const ImportColumnFamilyOptions& import_options_; + std::vector metadata_; +}; + +} // namespace rocksdb diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc new file mode 100644 index 00000000000..a93ecbf1173 --- /dev/null +++ b/db/import_column_family_test.cc @@ -0,0 +1,565 @@ +#ifndef ROCKSDB_LITE + +#include +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +class ImportColumnFamilyTest : public DBTestBase { + public: + ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + export_files_dir_ = test::TmpDir(env_) + "/export"; + import_cfh_ = nullptr; + import_cfh2_ = nullptr; + metadata_ptr_ = nullptr; + } + + ~ImportColumnFamilyTest() { + if (import_cfh_) { + db_->DropColumnFamily(import_cfh_); + db_->DestroyColumnFamilyHandle(import_cfh_); + import_cfh_ = nullptr; + } + if (import_cfh2_) { + db_->DropColumnFamily(import_cfh2_); + db_->DestroyColumnFamilyHandle(import_cfh2_); + import_cfh2_ = nullptr; + } + if (metadata_ptr_) { + delete metadata_ptr_; + metadata_ptr_ = nullptr; + } + test::DestroyDir(env_, sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + LiveFileMetaData LiveFileMetaDataInit(std::string name, + std::string path, + int level, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { + LiveFileMetaData metadata; + metadata.name = name; + metadata.db_path = path; + metadata.smallest_seqno = smallest_seqno; + metadata.largest_seqno = largest_seqno; + metadata.level = level; + return metadata; + } + + protected: + std::string sst_files_dir_; + std::string export_files_dir_; + ColumnFamilyHandle* import_cfh_; + ColumnFamilyHandle* import_cfh2_; + ExportImportFilesMetaData *metadata_ptr_; +}; + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst_name = "cf_unknown.sst"; + const std::string unknown_sst = sst_files_dir_ + unknown_sst_name; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K3", "V1")); + ASSERT_OK(sfw_unknown.Put("K4", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + { + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + 
ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_EQ(value, "V2"); + } +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + sfw_cf1.Put(Key(i), Key(i) + "_val"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + 
LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = NULL; + + std::string value1, value2; + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Modify keys in cf1 and verify. + for (int i = 0; i < 25; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i))); + } + for (int i = 25; i < 50; i++) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3")); + } + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Compact and check again. + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + // Compact to create a L1 file. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 50; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + + for (int i = 0; i < 25; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. + ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + + // Create a new db and import the files. 
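+  // (The exported metadata is self-contained: each LiveFileMetaData's db_path
+  // points into export_files_dir_, so a different DB instance can import the
+  // files, as long as its comparator matches db_comparator_name.)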
+ DB* db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + for (int i = 0; i < 100; ++i) { + std::string value; + db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_EQ(Get(1, Key(i)), value); + } + db_copy->DropColumnFamily(cfh); + test::DestroyDir(env_, dbname_ + "/db_copy"); +} + +TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + { + // Create column family with existing cf name. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int argc, char** argv) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0f8573e4319..d90ca900f45 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1174,6 +1174,27 @@ class DB { virtual Status IngestExternalFiles( const std::vector& args) = 0; + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB. + // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. 
When option specifies a
  // move, the call ensures that the specified files at external_file_path are
  // deleted on successful return and files are not modified on any error
  // return.
  // On error return, column family handle returned will be nullptr.
  // ColumnFamily will be present on successful return and will not be present
  // on error return. ColumnFamily may be present on any crash during this call.
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) = 0;
+
   virtual Status VerifyChecksum() = 0;

   // AddFile() is deprecated, please use IngestExternalFile()
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index a0ab41efdfb..7b251eb7203 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -108,4 +108,11 @@ struct LiveFileMetaData : SstFileMetaData {
   int level;  // Level at which this file resides.
   LiveFileMetaData() : column_family_name(), level(0) {}
 };
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamilyWithImport().
+struct ExportImportFilesMetaData {
+  std::string db_comparator_name;       // Used as a safety check at import.
+  std::vector<LiveFileMetaData> files;  // Vector of file metadata.
+};
 }  // namespace rocksdb
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 8ebcd292dba..09dc8e54c5c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1491,4 +1491,10 @@ struct TraceOptions {
   uint64_t filter = kTraceFilterNone;
 };
+
+// ImportColumnFamilyOptions is used by CreateColumnFamilyWithImport()
+struct ImportColumnFamilyOptions {
+  // Can be set to true to move the files instead of copying them.
+  bool move_files = false;
+};
+
 }  // namespace rocksdb
diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h
index aa0a394d4d0..5f12922c454 100644
--- a/include/rocksdb/utilities/checkpoint.h
+++ b/include/rocksdb/utilities/checkpoint.h
@@ -9,11 +9,15 @@
 #ifndef ROCKSDB_LITE

 #include
+#include
 #include "rocksdb/status.h"

 namespace rocksdb {

 class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;

 class Checkpoint {
  public:
@@ -36,6 +40,16 @@ class Checkpoint {
   virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
                                   uint64_t log_size_for_flush = 0);

+  // Exports all live SST files of a specified Column Family onto export_dir,
+  // returning SST files information in metadata.
+  // - SST files will be created as hard links when the directory specified
+  //   is in the same partition as the db directory, copied otherwise.
+  // - export_dir should not already exist and will be created by this API.
+  // - Always triggers a flush.
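+  //
+  // Illustrative usage (a sketch; `db` and `cf_handle` are assumed to exist,
+  // and on success the caller owns *metadata and must delete it):
+  //   Checkpoint* checkpoint = nullptr;
+  //   Status s = Checkpoint::Create(db, &checkpoint);
+  //   ExportImportFilesMetaData* metadata = nullptr;
+  //   if (s.ok()) {
+  //     s = checkpoint->ExportColumnFamily(cf_handle, "/tmp/cf_export",
+  //                                        &metadata);
+  //   }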
+ virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + virtual ~Checkpoint() {} }; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 8535952cd3e..a52aff5d8b1 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -120,6 +120,16 @@ class StackableDB : public DB { return db_->IngestExternalFiles(args); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } using DB::KeyMayExist; diff --git a/src.mk b/src.mk index fe930d5f49b..4d635173b89 100644 --- a/src.mk +++ b/src.mk @@ -36,6 +36,7 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ + db/import_column_family_job.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 4835f26da6e..0639ed2f2b4 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -22,6 +22,7 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/metadata.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/checkpoint.h" #include "test_util/sync_point.h" @@ -60,6 +61,12 @@ void CheckpointImpl::CleanStagingDirectory( full_private_path.c_str(), s.ToString().c_str()); } +Status Checkpoint::ExportColumnFamily( + ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/, + ExportImportFilesMetaData** /*metadata*/) { + return Status::NotSupported(""); +} + // Builds an openable snapshot of RocksDB Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush) { @@ -322,6 +329,184 @@ Status CheckpointImpl::CreateCustomCheckpoint( return s; } +// Exports all live SST files of a specified Column Family onto export_dir, +// returning SST files information in metadata. +Status CheckpointImpl::ExportColumnFamily( + ColumnFamilyHandle* handle, const std::string& export_dir, + ExportImportFilesMetaData** metadata) { + auto cfh = reinterpret_cast(handle); + const auto cf_name = cfh->GetName(); + const auto db_options = db_->GetDBOptions(); + + assert(metadata != nullptr); + assert(*metadata == nullptr); + auto s = db_->GetEnv()->FileExists(export_dir); + if (s.ok()) { + return Status::InvalidArgument("Specified export_dir exists"); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + + const auto final_nonslash_idx = export_dir.find_last_not_of('/'); + if (final_nonslash_idx == std::string::npos) { + return Status::InvalidArgument("Specified export_dir invalid"); + } + ROCKS_LOG_INFO(db_options.info_log, + "[%s] export column family onto export directory %s", + cf_name.c_str(), export_dir.c_str()); + + // Create a temporary export directory. 
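+  // (Writing into "<export_dir>.tmp" and renaming it only after all files
+  // have been linked/copied makes the final directory appear atomically, so
+  // a reader never observes a partially populated export_dir.)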
+ const auto tmp_export_dir = + export_dir.substr(0, final_nonslash_idx + 1) + ".tmp"; + s = db_->GetEnv()->CreateDir(tmp_export_dir); + + if (s.ok()) { + s = db_->Flush(rocksdb::FlushOptions(), handle); + } + + ColumnFamilyMetaData db_metadata; + if (s.ok()) { + // Export live sst files with file deletions disabled. + s = db_->DisableFileDeletions(); + if (s.ok()) { + db_->GetColumnFamilyMetaData(handle, &db_metadata); + + s = ExportFilesInMetaData( + db_options, db_metadata, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s", + cf_name.c_str(), fname.c_str()); + return db_->GetEnv()->LinkFile(src_dirname + fname, + tmp_export_dir + fname); + } /*link_file_cb*/, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s", + cf_name.c_str(), fname.c_str()); + return CopyFile(db_->GetEnv(), src_dirname + fname, + tmp_export_dir + fname, 0, db_options.use_fsync); + } /*copy_file_cb*/); + + const auto enable_status = db_->EnableFileDeletions(false /*force*/); + if (s.ok()) { + s = enable_status; + } + } + } + + auto moved_to_user_specified_dir = false; + if (s.ok()) { + // Move temporary export directory to the actual export directory. + s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir); + } + + if (s.ok()) { + // Fsync export directory. + moved_to_user_specified_dir = true; + std::unique_ptr dir_ptr; + s = db_->GetEnv()->NewDirectory(export_dir, &dir_ptr); + if (s.ok()) { + assert(dir_ptr != nullptr); + s = dir_ptr->Fsync(); + } + } + + if (s.ok()) { + // Export of files succeeded. Fill in the metadata information. + auto result_metadata = new ExportImportFilesMetaData(); + result_metadata->db_comparator_name = handle->GetComparator()->Name(); + for (const auto& level_metadata : db_metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + LiveFileMetaData live_file_metadata; + live_file_metadata.size = file_metadata.size; + live_file_metadata.name = std::move(file_metadata.name); + live_file_metadata.db_path = export_dir; + live_file_metadata.smallest_seqno = file_metadata.smallest_seqno; + live_file_metadata.largest_seqno = file_metadata.largest_seqno; + live_file_metadata.smallestkey = std::move(file_metadata.smallestkey); + live_file_metadata.largestkey = std::move(file_metadata.largestkey); + live_file_metadata.level = level_metadata.level; + result_metadata->files.push_back(live_file_metadata); + } + *metadata = result_metadata; + } + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.", + cf_name.c_str()); + } else { + // Failure: Clean up all the files/directories created. + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s", + cf_name.c_str(), s.ToString().c_str()); + std::vector subchildren; + const auto cleanup_dir = + moved_to_user_specified_dir ? 
export_dir : tmp_export_dir; + db_->GetEnv()->GetChildren(cleanup_dir, &subchildren); + for (const auto& subchild : subchildren) { + const auto subchild_path = cleanup_dir + "/" + subchild; + const auto status = db_->GetEnv()->DeleteFile(subchild_path); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s", + subchild_path.c_str(), status.ToString().c_str()); + } + } + const auto status = db_->GetEnv()->DeleteDir(cleanup_dir); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s", + cleanup_dir.c_str(), status.ToString().c_str()); + } + } + return s; +} + +Status CheckpointImpl::ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb) { + Status s; + auto hardlink_file = true; + + // Copy/hard link files in metadata. + size_t num_files = 0; + for (const auto& level_metadata : metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + uint64_t number; + FileType type; + const auto ok = ParseFileName(file_metadata.name, &number, &type); + if (!ok) { + s = Status::Corruption("Could not parse file name"); + break; + } + + // We should only get sst files here. + assert(type == kTableFile); + assert(file_metadata.size > 0 && file_metadata.name[0] == '/'); + const auto src_fname = file_metadata.name; + ++num_files; + + if (hardlink_file) { + s = link_file_cb(db_->GetName(), src_fname); + if (num_files == 1 && s.IsNotSupported()) { + // Fallback to copy if link failed due to cross-device directories. + hardlink_file = false; + s = Status::OK(); + } + } + if (!hardlink_file) { + s = copy_file_cb(db_->GetName(), src_fname); + } + if (!s.ok()) { + break; + } + } + } + ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt, + num_files); + + return s; +} } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h index d26a9f66bfc..0d87b635b8d 100644 --- a/utilities/checkpoint/checkpoint_impl.h +++ b/utilities/checkpoint/checkpoint_impl.h @@ -30,6 +30,17 @@ class CheckpointImpl : public Checkpoint { virtual Status CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush) override; + // Exports all live SST files of a specified Column Family onto export_dir + // and returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + using Checkpoint::ExportColumnFamily; + virtual Status ExportColumnFamily( + ColumnFamilyHandle* handle, const std::string& export_dir, + ExportImportFilesMetaData** metadata) override; + // Checkpoint logic can be customized by providing callbacks for link, copy, // or create. Status CreateCustomCheckpoint( @@ -48,6 +59,18 @@ class CheckpointImpl : public Checkpoint { private: void CleanStagingDirectory(const std::string& path, Logger* info_log); + + // Export logic customization by providing callbacks for link or copy. 
+ Status ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb); + + private: DB* db_; }; diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index d7d2548af3e..d748f500ebc 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -26,6 +26,7 @@ #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { class CheckpointTest : public testing::Test { @@ -44,6 +45,9 @@ class CheckpointTest : public testing::Test { Options last_options_; std::vector handles_; std::string snapshot_name_; + std::string export_path_; + ColumnFamilyHandle* cfh_reverse_comp_; + ExportImportFilesMetaData* metadata_; CheckpointTest() : env_(Env::Default()) { env_->SetBackgroundThreads(1, Env::LOW); @@ -64,12 +68,24 @@ class CheckpointTest : public testing::Test { EXPECT_OK(DestroyDB(snapshot_tmp_name, options)); env_->DeleteDir(snapshot_tmp_name); Reopen(options); + export_path_ = test::TmpDir(env_) + "/export"; + test::DestroyDir(env_, export_path_); + cfh_reverse_comp_ = nullptr; + metadata_ = nullptr; } ~CheckpointTest() override { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + if (cfh_reverse_comp_) { + EXPECT_OK(db_->DestroyColumnFamilyHandle(cfh_reverse_comp_)); + cfh_reverse_comp_ = nullptr; + } + if (metadata_) { + delete metadata_; + metadata_ = nullptr; + } Close(); Options options; options.db_paths.emplace_back(dbname_, 0); @@ -78,6 +94,7 @@ class CheckpointTest : public testing::Test { options.db_paths.emplace_back(dbname_ + "_4", 0); EXPECT_OK(DestroyDB(dbname_, options)); EXPECT_OK(DestroyDB(snapshot_name_, options)); + test::DestroyDir(env_, export_path_); } // Return the current option configuration. @@ -140,6 +157,12 @@ class CheckpointTest : public testing::Test { ASSERT_OK(TryReopen(options)); } + void CompactAll() { + for (auto h : handles_) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), h, nullptr, nullptr)); + } + } + void Close() { for (auto h : handles_) { delete h; @@ -289,6 +312,109 @@ TEST_F(CheckpointTest, GetSnapshotLink) { } } +TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) { + // Create a database + Status s; + auto options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + // Helper to verify the number of files in metadata and export dir + auto verify_files_exported = [&](const ExportImportFilesMetaData& metadata, + int num_files_expected) { + ASSERT_EQ(metadata.files.size(), num_files_expected); + std::vector subchildren; + env_->GetChildren(export_path_, &subchildren); + int num_children = 0; + for (const auto& child : subchildren) { + if (child != "." 
&& child != "..") { + ++num_children; + } + } + ASSERT_EQ(num_children, num_files_expected); + }; + + // Test DefaultColumnFamily + { + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + test::DestroyDir(env_, export_path_); + delete metadata_; + metadata_ = nullptr; + + // Check again after compaction + CompactAll(); + ASSERT_OK(Put(key, "v2")); + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 2); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + test::DestroyDir(env_, export_path_); + delete metadata_; + metadata_ = nullptr; + delete checkpoint; + } + + // Test non default column family with non default comparator + { + auto cf_options = CurrentOptions(); + cf_options.comparator = ReverseBytewiseComparator(); + ASSERT_OK( + db_->CreateColumnFamily(cf_options, "yoyo", &cfh_reverse_comp_)); + + const auto key = std::string("foo"); + ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_, + &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, + ReverseBytewiseComparator()->Name()); + delete checkpoint; + } +} + +TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) { + // Create a database + Status s; + auto options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export onto existing directory + env_->CreateDirIfMissing(export_path_); + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir exists")); + test::DestroyDir(env_, export_path_); + + // Export with invalid directory specification + export_path_ = ""; + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir invalid")); + delete checkpoint; +} + TEST_F(CheckpointTest, CheckpointCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options); From 8a008d41701823af69c2185c7460280c5d8fac74 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 17 Jul 2019 13:02:00 -0700 Subject: [PATCH 231/572] Block access tracing: Trace referenced key for Get on non-data blocks. (#5548) Summary: This PR traces the referenced key for Get for all types of blocks. This is useful when evaluating hybrid row-block caches. 
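A minimal sketch of a trace consumer reading the new field (illustration only, not part of this change; the include paths and trace file path are assumptions, while the reader API matches the one exercised in the table_test.cc changes below):

    // Illustration only: iterate a block cache trace and print the referenced
    // key, which this change now records for index/filter block accesses too.
    #include <cinttypes>
    #include <cstdio>
    #include <memory>
    #include <string>
    #include <utility>

    #include "rocksdb/trace_reader_writer.h"      // assumed header for NewFileTraceReader
    #include "trace_replay/block_cache_tracer.h"

    void DumpGetAccesses(rocksdb::Env* env, const std::string& trace_file) {
      std::unique_ptr<rocksdb::TraceReader> trace_reader;
      rocksdb::Status s = rocksdb::NewFileTraceReader(
          env, rocksdb::EnvOptions(), trace_file, &trace_reader);
      if (!s.ok()) {
        return;
      }
      rocksdb::BlockCacheTraceReader reader(std::move(trace_reader));
      rocksdb::BlockCacheTraceHeader header;
      if (!reader.ReadHeader(&header).ok()) {
        return;
      }
      rocksdb::BlockCacheTraceRecord access;
      while (reader.ReadAccess(&access).ok()) {
        if (access.caller == rocksdb::TableReaderCaller::kUserGet) {
          // referenced_key carries the full internal key (user key + seqno).
          fprintf(stdout, "get_id=%" PRIu64 " key=%s\n", access.get_id,
                  access.referenced_key.c_str());
        }
      }
    }

Because the referenced key is now recorded for index and filter accesses as well, such a consumer can attribute every block access within a Get to the row that triggered it, which is what a hybrid row-block cache evaluation needs.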
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5548 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16157979 Pulled By: HaoyuHuang fbshipit-source-id: f6327411c9deb74e35e22a35f66cdbae09ab9d87 --- table/block_based/block_based_table_reader.cc | 58 ++-- table/table_test.cc | 276 +++++++++++++++++- tools/block_cache_trace_analyzer.h | 4 +- trace_replay/block_cache_tracer.cc | 34 ++- trace_replay/block_cache_tracer.h | 40 ++- trace_replay/block_cache_tracer_test.cc | 9 +- utilities/simulator_cache/cache_simulator.cc | 19 +- utilities/simulator_cache/cache_simulator.h | 1 - .../simulator_cache/cache_simulator_test.cc | 22 +- 9 files changed, 386 insertions(+), 77 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a888603d72b..fde11c0d362 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1983,10 +1983,12 @@ CachableEntry BlockBasedTable::GetUncompressionDict( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id); + /*no_insert=*/no_io, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); block_cache_tracer_->WriteBlockAccess(access_record, cache_key, rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + lookup_context->referenced_key); } return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, cache_handle, false /* own_value */}; @@ -2237,7 +2239,6 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( Slice key /* key to the block cache */; Slice ckey /* key to the compressed block cache */; bool is_cache_hit = false; - bool no_insert = true; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { @@ -2265,7 +2266,6 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - no_insert = false; Statistics* statistics = rep_->ioptions.statistics; const bool maybe_compressed = block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; @@ -2332,11 +2332,11 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( assert(false); break; } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey( + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( trace_block_type, lookup_context->caller)) { // Defer logging the access to Get() and MultiGet() to trace additional - // information, e.g., the referenced key, - // referenced_key_exist_in_block. + // information, e.g., referenced_key_exist_in_block. // Make a copy of the block key here since it will be logged later. 
lookup_context->FillLookupContext( @@ -2351,10 +2351,12 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - no_insert, lookup_context->get_id); + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + lookup_context->referenced_key); } } @@ -3288,12 +3290,18 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // First check the full filter // If full filter not useful, Then go into each block uint64_t tracing_get_id = get_context->get_tracing_get_id(); - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, - tracing_get_id}; + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } const bool may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, get_context, &lookup_context); - if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); @@ -3347,7 +3355,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet, tracing_get_id}; + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3406,7 +3416,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (does_referenced_key_exist) { referenced_key = biter.key(); } else { - referenced_key = ExtractUserKey(key); + referenced_key = key; } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), @@ -3417,6 +3427,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); @@ -3460,8 +3471,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, - tracing_mget_id}; + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, prefix_extractor, &lookup_context); @@ -3492,11 +3504,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); 
- BlockCacheLookupContext lookup_compression_dict_context( - TableReaderCaller::kUserMultiGet); - auto uncompression_dict_storage = GetUncompressionDict(nullptr, no_io, - sst_file_range.begin()->get_context, - &lookup_compression_dict_context); + auto uncompression_dict_storage = GetUncompressionDict( + nullptr, no_io, sst_file_range.begin()->get_context, &lookup_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() @@ -3591,7 +3600,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet, tracing_mget_id); + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr); if (first_block) { if (!block_handles[idx_in_batch].IsNull() || !results[idx_in_batch].IsEmpty()) { @@ -3685,7 +3696,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (does_referenced_key_exist) { referenced_key = biter->key(); } else { - referenced_key = ExtractUserKey(key); + referenced_key = key; } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), @@ -3696,6 +3707,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); diff --git a/table/table_test.cc b/table/table_test.cc index c54933b781a..bb034311668 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -63,6 +63,8 @@ extern const uint64_t kPlainTableMagicNumber; namespace { +const std::string kDummyValue(10000, 'o'); + // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: @@ -312,7 +314,9 @@ class TableConstructor: public Constructor { : Constructor(cmp), largest_seqno_(largest_seqno), convert_to_internal_key_(convert_to_internal_key), - level_(level) {} + level_(level) { + env_ = rocksdb::Env::Default(); + } ~TableConstructor() override { Reset(); } Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, @@ -371,7 +375,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_, largest_seqno_, nullptr), + level_, largest_seqno_, &block_cache_tracer_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -425,6 +429,8 @@ class TableConstructor: public Constructor { return static_cast(file_writer_->writable_file()); } + BlockCacheTracer block_cache_tracer_; + private: void Reset() { uniq_id_ = 0; @@ -445,6 +451,7 @@ class TableConstructor: public Constructor { static uint64_t cur_uniq_id_; EnvOptions soptions; + Env* env_; }; uint64_t TableConstructor::cur_uniq_id_ = 1; @@ -1063,7 +1070,9 @@ class BlockBasedTableTest : public TableTest, virtual public ::testing::WithParamInterface { public: - BlockBasedTableTest() : format_(GetParam()) {} + BlockBasedTableTest() : format_(GetParam()) { + env_ = rocksdb::Env::Default(); + } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions 
options; @@ -1071,11 +1080,91 @@ class BlockBasedTableTest return options; } + void SetupTracingTest(TableConstructor* c) { + test_path_ = test::PerThreadDBPath("block_based_table_tracing_test"); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace_file"; + TraceOptions trace_opt; + std::unique_ptr trace_writer; + EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, + &trace_writer)); + c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + { + std::string user_key = "k02"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + } + + void VerifyBlockAccessTrace( + TableConstructor* c, + const std::vector& expected_records) { + c->block_cache_tracer_.EndTrace(); + + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; + } + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; + } + EXPECT_EQ(index, expected_records.size()); + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + protected: uint64_t IndexUncompressedHelper(bool indexCompress); private: uint32_t format_; + Env* env_; + std::string trace_file_path_; + std::string test_path_; }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; @@ -2211,6 +2300,187 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, TracingGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + 
table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + for (uint32_t i = 1; i <= 2; i++) { + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, /*get_id=*/i); + get_perf_context()->Reset(); + ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value.ToString(), kDummyValue); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for one index, one filter, and one data + // block access. + record.get_id = 1; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + record.referenced_key_exist_in_block = Boolean::kTrue; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.is_cache_hit = Boolean::kFalse; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + // The second get should all observe cache hits. 
+ record.is_cache_hit = Boolean::kTrue; + record.get_id = 2; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + for (uint32_t i = 1; i <= 2; i++) { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.GetTableReader()->ApproximateOffsetOf( + encoded_key, TableReaderCaller::kUserApproximateSize); + } + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have two records for only index blocks. 
+ record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserApproximateSize; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingIterator) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + for (uint32_t i = 1; i <= 2; i++) { + std::unique_ptr iter(c.GetTableReader()->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUserIterator)); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for index and two data block access. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserIterator; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + record.is_cache_hit = Boolean::kFalse; + expected_records.push_back(record); + expected_records.push_back(record); + // When we iterate this file for the second time, we should observe all cache + // hits. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + // A simple tool that takes the snapshot of block cache statistics. 
class BlockCachePropertiesSnapshot { public: diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index feb7c21f22c..32a90342cb1 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -57,8 +57,8 @@ struct BlockAccessInfo { const uint64_t timestamp_in_seconds = access.access_timestamp / kMicrosInSecond; caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1; - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type, - access.caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type, + access.caller)) { num_keys = access.num_keys_in_block; if (access.referenced_key_exist_in_block == Boolean::kTrue) { if (key_num_access_map.find(access.referenced_key) == diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index a74dc4d58cb..4f320ef2d0f 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -35,14 +35,13 @@ const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; const uint64_t BlockCacheTraceHelper::kReservedGetId = 0; -bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller) { +bool BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + TraceType block_type, TableReaderCaller caller) { return (block_type == TraceType::kBlockTraceDataBlock) && - (caller == TableReaderCaller::kUserGet || - caller == TableReaderCaller::kUserMultiGet); + IsGetOrMultiGet(caller); } -bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { +bool BlockCacheTraceHelper::IsGetOrMultiGet(TableReaderCaller caller) { return caller == TableReaderCaller::kUserGet || caller == TableReaderCaller::kUserMultiGet; } @@ -81,12 +80,13 @@ Status BlockCacheTraceWriter::WriteBlockAccess( trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); - if (BlockCacheTraceHelper::ShouldTraceGetId(record.caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGet(record.caller)) { PutFixed64(&trace.payload, record.get_id); - } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, - record.caller)) { + trace.payload.push_back(record.get_from_user_specified_snapshot); PutLengthPrefixedSlice(&trace.payload, referenced_key); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record.block_type, + record.caller)) { PutFixed64(&trace.payload, record.referenced_data_size); PutFixed64(&trace.payload, record.num_keys_in_block); trace.payload.push_back(record.referenced_key_exist_in_block); @@ -216,20 +216,28 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { } record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - if (BlockCacheTraceHelper::ShouldTraceGetId(record->caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGet(record->caller)) { if (!GetFixed64(&enc_slice, &record->get_id)) { return Status::Incomplete( "Incomplete access record: Failed to read the get id."); } - } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, - record->caller)) { + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "get_from_user_specified_snapshot."); + } + record->get_from_user_specified_snapshot = + static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); Slice referenced_key; if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { return 
Status::Incomplete( "Incomplete access record: Failed to read the referenced key."); } record->referenced_key = referenced_key.ToString(); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record->block_type, + record->caller)) { if (!GetFixed64(&enc_slice, &record->referenced_data_size)) { return Status::Incomplete( "Incomplete access record: Failed to read the referenced data size."); diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 3b26a18d639..b1a258843e5 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -23,9 +23,9 @@ extern const uint64_t kSecondInHour; class BlockCacheTraceHelper { public: - static bool ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller); - static bool ShouldTraceGetId(TableReaderCaller caller); + static bool IsGetOrMultiGetOnDataBlock(TraceType block_type, + TableReaderCaller caller); + static bool IsGetOrMultiGet(TableReaderCaller caller); static bool IsUserAccess(TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; @@ -53,8 +53,11 @@ class BlockCacheTraceHelper { // kUserApproximateSize). struct BlockCacheLookupContext { BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} - BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id) - : caller(_caller), get_id(_get_id) {} + BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id, + bool _get_from_user_specified_snapshot) + : caller(_caller), + get_id(_get_id), + get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {} const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses these inforation when logging the block access at @@ -69,6 +72,8 @@ struct BlockCacheLookupContext { // how many blocks a Get/MultiGet request accesses. We can also measure the // impact of row cache vs block cache. uint64_t get_id = 0; + std::string referenced_key; + bool get_from_user_specified_snapshot = false; void FillLookupContext(bool _is_cache_hit, bool _no_insert, TraceType _block_type, uint64_t _block_size, @@ -100,23 +105,25 @@ struct BlockCacheTraceRecord { Boolean no_insert = Boolean::kFalse; // Required field for Get and MultiGet uint64_t get_id = BlockCacheTraceHelper::kReservedGetId; - // Required fields for data block and user Get/Multi-Get only. + Boolean get_from_user_specified_snapshot = Boolean::kFalse; std::string referenced_key; + // Required fields for data block and user Get/Multi-Get only. 
uint64_t referenced_data_size = 0; uint64_t num_keys_in_block = 0; Boolean referenced_key_exist_in_block = Boolean::kFalse; BlockCacheTraceRecord() {} - BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, - TraceType _block_type, uint64_t _block_size, - uint64_t _cf_id, std::string _cf_name, uint32_t _level, - uint64_t _sst_fd_number, TableReaderCaller _caller, - bool _is_cache_hit, bool _no_insert, uint64_t _get_id, - std::string _referenced_key = "", - uint64_t _referenced_data_size = 0, - uint64_t _num_keys_in_block = 0, - bool _referenced_key_exist_in_block = false) + BlockCacheTraceRecord( + uint64_t _access_timestamp, std::string _block_key, TraceType _block_type, + uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, + uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller, + bool _is_cache_hit, bool _no_insert, + uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId, + bool _get_from_user_specified_snapshot = false, + std::string _referenced_key = "", uint64_t _referenced_data_size = 0, + uint64_t _num_keys_in_block = 0, + bool _referenced_key_exist_in_block = false) : access_timestamp(_access_timestamp), block_key(_block_key), block_type(_block_type), @@ -129,6 +136,9 @@ struct BlockCacheTraceRecord { is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), get_id(_get_id), + get_from_user_specified_snapshot(_get_from_user_specified_snapshot + ? Boolean::kTrue + : Boolean::kFalse), referenced_key(_referenced_key), referenced_data_size(_referenced_data_size), num_keys_in_block(_num_keys_in_block), diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index aae513ad5d7..c9983aee190 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -74,6 +74,7 @@ class BlockCacheTracerTest : public testing::Test { // Provide get_id for all callers. The writer should only write get_id // when the caller is either GET or MGET. record.get_id = key_id + 1; + record.get_from_user_specified_snapshot = Boolean::kTrue; // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
@@ -126,20 +127,22 @@ class BlockCacheTracerTest : public testing::Test { if (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet) { ASSERT_EQ(key_id + 1, record.get_id); + ASSERT_EQ(Boolean::kTrue, record.get_from_user_specified_snapshot); + ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), + record.referenced_key); } else { ASSERT_EQ(BlockCacheTraceHelper::kReservedGetId, record.get_id); + ASSERT_EQ(Boolean::kFalse, record.get_from_user_specified_snapshot); + ASSERT_EQ("", record.referenced_key); } if (block_type == TraceType::kBlockTraceDataBlock && (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet)) { - ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), - record.referenced_key); ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block); ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size); continue; } - ASSERT_EQ("", record.referenced_key); ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block); ASSERT_EQ(0, record.num_keys_in_block); ASSERT_EQ(0, record.referenced_data_size); diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index ebfc4cd0eb0..90433df11bf 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -110,19 +110,22 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { std::string HybridRowBlockCacheSimulator::ComputeRowKey( const BlockCacheTraceRecord& access) { assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); - Slice key; - if (access.referenced_key_exist_in_block == Boolean::kTrue) { - key = ExtractUserKey(access.referenced_key); - } else { - key = access.referenced_key; - } - return std::to_string(access.sst_fd_number) + "_" + key.ToString(); + Slice key = ExtractUserKey(access.referenced_key); + uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); + return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + + std::to_string(seq_no); } void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { bool is_cache_miss = true; bool admitted = true; - if (access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // TODO (haoyu): We only support Get for now. We need to extend the tracing + // for MultiGet, i.e., non-data block accesses must log all keys in a + // MultiGet. + if (access.caller == TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { // This is a Get/MultiGet request. const std::string& row_key = ComputeRowKey(access); if (getid_getkeys_map_[access.get_id].find(row_key) == diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index b6667eeed12..82972688658 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -137,7 +137,6 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { private: // Row key is a concatenation of the access's fd_number and the referenced // user key. - // TODO(haoyu): the row key should contain sequence number. 
std::string ComputeRowKey(const BlockCacheTraceRecord& access); enum InsertResult : char { diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index fb0c9e84976..f435785e6a1 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -174,10 +174,11 @@ TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { uint64_t block_id = 100; BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + first_get.get_from_user_specified_snapshot = Boolean::kTrue; BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1); second_get.referenced_data_size = 0; second_get.referenced_key_exist_in_block = Boolean::kFalse; - second_get.referenced_key = kRefKeyPrefix + std::to_string(kGetId); + second_get.get_from_user_specified_snapshot = Boolean::kTrue; BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2); third_get.referenced_data_size = 0; third_get.referenced_key_exist_in_block = Boolean::kFalse; @@ -203,9 +204,10 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(100, cache_simulator->miss_ratio()); ASSERT_EQ(10, cache_simulator->user_accesses()); ASSERT_EQ(100, cache_simulator->user_miss_ratio()); - auto handle = - sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + - "_" + first_get.referenced_key)); + auto handle = sim_cache->Lookup( + std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString() + "_" + + std::to_string(1 + GetInternalKeySeqno(first_get.referenced_key))); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -227,8 +229,10 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(66, static_cast(cache_simulator->miss_ratio())); ASSERT_EQ(15, cache_simulator->user_accesses()); ASSERT_EQ(66, static_cast(cache_simulator->user_miss_ratio())); - handle = sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" + - second_get.referenced_key); + handle = sim_cache->Lookup( + std::to_string(second_get.sst_fd_number) + "_" + + ExtractUserKey(second_get.referenced_key).ToString() + "_" + + std::to_string(1 + GetInternalKeySeqno(second_get.referenced_key))); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -283,9 +287,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - auto handle = - sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + - "_" + first_get.referenced_key)); + auto handle = sim_cache->Lookup( + std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString() + "_0"); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); // All blocks are missing from the cache since insert_blocks_row_kvpair_misses From 9f5cfb8e7142fe7b8fe4668aefd481e881f5bb42 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Wed, 17 Jul 2019 17:01:30 -0700 Subject: [PATCH 232/572] Fix for ReadaheadSequentialFile crash in ldb_cmd_test (#5586) Summary: Fixing a corner case crash when there was no data read from file, but status is still OK Pull Request resolved: https://github.com/facebook/rocksdb/pull/5586 Differential Revision: D16348117 Pulled By: elipoz fbshipit-source-id: f97973308024f020d8be79ca3c56466b84d80656 --- util/file_reader_writer.cc | 9 +++++++-- 
util/file_reader_writer_test.cc | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 9175fa502f9..b6a5eefcfdb 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -738,7 +738,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); - assert(buffer_.BufferStart() == result.data()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); } return s; } @@ -886,7 +886,7 @@ class ReadaheadSequentialFile : public SequentialFile { if (s.ok()) { buffer_offset_ = read_offset_; buffer_.Size(result.size()); - assert(buffer_.BufferStart() == result.data()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); } return s; } @@ -1027,6 +1027,11 @@ std::unique_ptr NewReadaheadRandomAccessFile( std::unique_ptr SequentialFileReader::NewReadaheadSequentialFile( std::unique_ptr&& file, size_t readahead_size) { + if (file->GetRequiredBufferAlignment() >= readahead_size) { + // Short-circuit and return the original file if readahead_size is + // too small and hence doesn't make sense to be used for prefetching. + return std::move(file); + } std::unique_ptr result( new ReadaheadSequentialFile(std::move(file), readahead_size)); return result; diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index aa74303b8fc..1b86f798f7f 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -325,7 +325,7 @@ class ReadaheadSequentialFileTest : public testing::Test, public testing::WithParamInterface { public: static std::vector GetReadaheadSizeList() { - return {1lu << 12, 1lu << 16}; + return {1lu << 8, 1lu << 12, 1lu << 16, 1lu << 18}; } void SetUp() override { readahead_size_ = GetParam(); From ec2b996b29ab45d7d33a124f499344c8fb054229 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 17 Jul 2019 22:02:49 -0700 Subject: [PATCH 233/572] Fix LITE mode build failure Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5588 Test Plan: make LITE=1 all check Differential Revision: D16354543 Pulled By: anand1976 fbshipit-source-id: 327a171439e183ac3a5e5057c511d6bca445e97d --- db/import_column_family_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index a93ecbf1173..76a8b90fadd 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -555,7 +555,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as External SST File Writer and Import are not supported " "in ROCKSDB_LITE\n"); From 3a6e83b56bbbebbd351c6666b31398be960c135d Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 18 Jul 2019 10:13:05 -0700 Subject: [PATCH 234/572] HISTORY update for export and import column family APIs Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5587 Differential Revision: D16359919 fbshipit-source-id: cfd9c448d79a8b8e7ac1d2b661d10151df269dba --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 2e1e03f68de..b9d0f741317 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. 
 * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path
 * Overload GetAllKeyVersions() to support non-default column family.
+* Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469

 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.

From abd1fdddef8c72a3ffa736284c03ea550ace211b Mon Sep 17 00:00:00 2001
From: anand76
Date: Thu, 18 Jul 2019 14:38:23 -0700
Subject: [PATCH 235/572] Fix asan_check failures

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5589

Test Plan: TEST_TMPDIR=/dev/shm/rocksdb COMPILE_WITH_ASAN=1 OPT=-g make J=64 -j64 asan_check

Differential Revision: D16361081

Pulled By: anand1976

fbshipit-source-id: 09474832b9cfb318a840d4b633e22dfad105d58c
---
 db/import_column_family_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc
index 76a8b90fadd..bc239c699ba 100644
--- a/db/import_column_family_test.cc
+++ b/db/import_column_family_test.cc
@@ -298,6 +298,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;

   ImportColumnFamilyOptions import_options;
   import_options.move_files = false;
@@ -407,6 +408,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;

   // Create a new db and import the files.
   DB* db_copy;
@@ -424,6 +426,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
     ASSERT_EQ(Get(1, Key(i)), value);
   }
   db_copy->DropColumnFamily(cfh);
+  db_copy->DestroyColumnFamilyHandle(cfh);
   test::DestroyDir(env_, dbname_ + "/db_copy");
 }

From 6bb3b4b567452ff88b6023d3db61bba2e4125d6c Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 19 Jul 2019 11:31:52 -0700
Subject: [PATCH 236/572] ldb idump to support non-default column families. (#5594)

Summary: ldb idump currently works only for the default column family. Extend it to support non-default column families as well.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5594

Test Plan: Compile and run the tool against a DB with multiple column families.
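For context, the command now goes through the GetAllKeyVersions() overload that takes a ColumnFamilyHandle (see the HISTORY.md entry above). A minimal sketch of calling it directly, with the header path and KeyVersion field names assumed from rocksdb/utilities/debug.h and placeholder bounds:

    // Illustration only: dump internal key versions from a non-default
    // column family, mirroring what InternalDumpCommand::DoCommand() now does.
    #include <cinttypes>
    #include <cstdio>
    #include <vector>

    #include "rocksdb/utilities/debug.h"  // assumed location of GetAllKeyVersions()

    rocksdb::Status DumpKeyVersions(rocksdb::DB* db,
                                    rocksdb::ColumnFamilyHandle* cfh) {
      std::vector<rocksdb::KeyVersion> key_versions;
      // Empty bounds are assumed to cover the whole keyspace; 100 caps output.
      rocksdb::Status st =
          rocksdb::GetAllKeyVersions(db, cfh, "", "", 100, &key_versions);
      if (st.ok()) {
        for (const auto& kv : key_versions) {
          fprintf(stdout, "%s @ seq %" PRIu64 " => %s\n", kv.user_key.c_str(),
                  kv.sequence, kv.value.c_str());
        }
      }
      return st;
    }

Passing db->DefaultColumnFamily() as the handle should preserve the previous behavior.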
Differential Revision: D16380684

fbshipit-source-id: bfb8af36fdad1806837c90aaaab492d71528aceb
---
 tools/ldb_cmd.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 8f4258cf36e..22b2399a278 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -1301,7 +1301,8 @@ void InternalDumpCommand::DoCommand() {
   // Cast as DBImpl to get internal iterator
   std::vector<KeyVersion> key_versions;
-  Status st = GetAllKeyVersions(db_, from_, to_, max_keys_, &key_versions);
+  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
+                                &key_versions);
   if (!st.ok()) {
     exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;

From c129c75fb7810959a3da548d03bd3cededcb0a8f Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Fri, 19 Jul 2019 11:54:38 -0700
Subject: [PATCH 237/572] Added log_readahead_size option to control prefetching for Log::Reader (#5592)

Summary: Added log_readahead_size option to control prefetching for Log::Reader. This is mostly useful for reading a remotely located log, as it can reduce the number of round-trips when reading it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5592

Differential Revision: D16362989

Pulled By: elipoz

fbshipit-source-id: c5d4d5245a44008cd59879640efff70c091ad3e8
---
 db/db_impl/db_impl_open.cc       |  3 ++-
 db/db_impl/db_impl_secondary.cc  |  3 ++-
 db/version_set.cc                |  9 ++++++---
 include/rocksdb/env.h            |  4 ++--
 include/rocksdb/options.h        |  7 +++++++
 options/db_options.cc            |  6 +++++-
 options/db_options.h             |  1 +
 options/options_helper.cc        |  5 ++++-
 options/options_settable_test.cc |  3 ++-
 9 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 82e61a260b8..0e0fcfbf2c3 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -721,7 +721,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
         continue;
       }
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), fname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), fname, immutable_db_options_.log_readahead_size));
   }

   // Create the log reader.
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index e14e53e55c3..a73cd6ba296 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -150,7 +150,8 @@ Status DBImplSecondary::MaybeInitLogReader(
       *log_reader = nullptr;
       return status;
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), fname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), fname, immutable_db_options_.log_readahead_size));
   }

   // Create the log reader.
diff --git a/db/version_set.cc b/db/version_set.cc
index 0d3b9fb4e32..559a4190f16 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4267,7 +4267,8 @@ Status VersionSet::Recover(
       return s;
     }
     manifest_file_reader.reset(
-        new SequentialFileReader(std::move(manifest_file), manifest_path));
+        new SequentialFileReader(std::move(manifest_file), manifest_path,
+                                 db_options_->log_readahead_size));
   }
   uint64_t current_manifest_file_size;
   s = env_->GetFileSize(manifest_path, &current_manifest_file_size);
@@ -4597,7 +4598,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
     if (!s.ok()) {
       return s;
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), dscname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), dscname, db_options_->log_readahead_size));
   }

   bool have_prev_log_number = false;
@@ -5721,7 +5723,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
   std::unique_ptr<SequentialFileReader> manifest_file_reader;
   if (s.ok()) {
     manifest_file_reader.reset(
-        new SequentialFileReader(std::move(manifest_file), manifest_path));
+        new SequentialFileReader(std::move(manifest_file), manifest_path,
+                                 db_options_->log_readahead_size));
     manifest_reader->reset(new log::FragmentBufferedReader(
         nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
         0 /* log_number */));
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 67464cc5c55..126f25747ff 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -118,10 +118,10 @@ struct EnvOptions {
   bool fallocate_with_keep_size = true;

   // See DBOptions doc
-  size_t compaction_readahead_size;
+  size_t compaction_readahead_size = 0;

   // See DBOptions doc
-  size_t random_access_max_buffer_size;
+  size_t random_access_max_buffer_size = 0;

   // See DBOptions doc
   size_t writable_file_max_buffer_size = 1024 * 1024;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 09dc8e54c5c..234af6a31eb 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1087,6 +1087,13 @@ struct DBOptions {
   // If set to true, takes precedence over
   // ReadOptions::background_purge_on_iterator_cleanup.
   bool avoid_unnecessary_blocking_io = false;
+
+  // If non-zero, we perform bigger reads when reading the log, prefetching
+  // log_readahead_size bytes at a time. This is mostly useful for reading a
+  // remotely located log, as it can reduce the number of round-trips. If 0,
+  // prefetching is disabled.
+ // + // Default: 0 + size_t log_readahead_size = 0; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/options/db_options.cc b/options/db_options.cc index 490a3708030..3756c555ceb 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -85,7 +85,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), - persist_stats_to_disk(options.persist_stats_to_disk) { + persist_stats_to_disk(options.persist_stats_to_disk), + log_readahead_size(options.log_readahead_size) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -225,6 +226,9 @@ void ImmutableDBOptions::Dump(Logger* log) const { avoid_unnecessary_blocking_io); ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", persist_stats_to_disk); + ROCKS_LOG_HEADER( + log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, + log_readahead_size); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 92eea4ecfa1..e39e2903ff3 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -82,6 +82,7 @@ struct ImmutableDBOptions { bool atomic_flush; bool avoid_unnecessary_blocking_io; bool persist_stats_to_disk; + size_t log_readahead_size; }; struct MutableDBOptions { diff --git a/options/options_helper.cc b/options/options_helper.cc index 47aba7ad035..922ece3a81a 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -138,7 +138,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.atomic_flush = immutable_db_options.atomic_flush; options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; - + options.log_readahead_size = immutable_db_options.log_readahead_size; return options; } @@ -1664,6 +1664,9 @@ std::unordered_map {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, false, offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, + {"log_readahead_size", + {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, }; std::unordered_map diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f0b79e372f7..e60fd6f9ebf 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -295,7 +295,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "manual_wal_flush=false;" "seq_per_batch=false;" "atomic_flush=false;" - "avoid_unnecessary_blocking_io=false", + "avoid_unnecessary_blocking_io=false;" + "log_readahead_size=0", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), From 4f7ba3aaed08b0f29a2828a44fb9eed525f47610 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 19 Jul 2019 13:20:45 -0700 Subject: [PATCH 238/572] Fix tsan and valgrind failures in import_column_family_test Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5598 Test Plan: tsan_check valgrind_test Differential Revision: D16380167 Pulled By: anand1976 fbshipit-source-id: 2d0caea7d2d02a9606457f62811175d762b89d5c --- db/import_column_family_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index bc239c699ba..4f695d33f90 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ 
-427,6 +427,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { } db_copy->DropColumnFamily(cfh); db_copy->DestroyColumnFamilyHandle(cfh); + delete db_copy; test::DestroyDir(env_, dbname_ + "/db_copy"); } From 0be1feec216cfdbc1c8feab95c88dad2eefab3df Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Fri, 19 Jul 2019 14:55:07 -0700 Subject: [PATCH 239/572] Added .watchmanconfig file to rocksdb repo (#5593) Summary: Added a .watchmanconfig file to the rocksdb repo. It is currently .gitignored. This allows watchman to auto-sync modified files when they are edited remotely. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5593 Differential Revision: D16363860 Pulled By: elipoz fbshipit-source-id: 5ae221e21c6c757ceb08877771550d508f773d55 --- .gitignore | 1 + .watchmanconfig | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 .watchmanconfig diff --git a/.gitignore b/.gitignore index 180fb4c5007..7a799c09a9d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ make_config.mk *.vcxproj.filters *.sln *.cmake +.watchmanconfig CMakeCache.txt CMakeFiles/ build/ diff --git a/.watchmanconfig b/.watchmanconfig new file mode 100644 index 00000000000..e5b450d7bbb --- /dev/null +++ b/.watchmanconfig @@ -0,0 +1,6 @@ +{ + "content_hash_warming": true, + "content_hash_max_items": 333333, + "hint_num_files_per_dir": 8, + "fsevents_latency": 0.05 +} From a78503bd6c80a3c4137df1962a972fe406b4d90b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 22 Jul 2019 14:35:03 -0700 Subject: [PATCH 240/572] Temporarily disable snapshot list refresh for atomic flush stress test (#5581) Summary: The atomic flush test started to fail after https://github.com/facebook/rocksdb/issues/5099. Then https://github.com/facebook/rocksdb/issues/5278 provided a fix, after which the same error occurred much less frequently. However, it still occurs occasionally, and the root cause is not yet known. This PR disables the snapshot list refresh feature; we should keep an eye on the failure in the future.
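Expressed directly against the options API, the stress test change amounts to the following sketch; snap_refresh_nanos is the option that the new db_stress flag below feeds into:

rocksdb::Options options;
// db_stress defaults its flag to 100 * 1000 * 1000 ns (100ms); passing 0, as
// the crash test now does under atomic flush, turns periodic snapshot list
// refresh off entirely.
options.snap_refresh_nanos = 0;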
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5581 Differential Revision: D16295985 Pulled By: riversand963 fbshipit-source-id: c9e62e65133c52c21b07097de359632ca62571e4 --- tools/db_crashtest.py | 1 + tools/db_stress.cc | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 2a38d4c96d9..709406e56f4 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -141,6 +141,7 @@ def is_direct_io_supported(dbname): "write_buffer_size": 1024 * 1024, # disable pipelined write when test_atomic_flush is true "enable_pipelined_write": 0, + "snap_refresh_nanos": 0, } diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 3f767a9e76a..98d088e345e 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -512,6 +512,10 @@ DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); static const bool FLAGS_num_iterations_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); +DEFINE_uint64( + snap_refresh_nanos, 100 * 1000 * 1000, + "If non-zero, compactions will periodically refresh snapshot list."); + namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -2724,6 +2728,8 @@ class StressTest { fprintf(stdout, " %s\n", p.c_str()); } } + fprintf(stdout, "Snapshot refresh nanos : %" PRIu64 "\n", + FLAGS_snap_refresh_nanos); fprintf(stdout, "------------------------------------------------\n"); } @@ -2873,6 +2879,7 @@ class StressTest { } else { options_.merge_operator = MergeOperators::CreatePutOperator(); } + options_.snap_refresh_nanos = FLAGS_snap_refresh_nanos; fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); From 3778470061c77f773fab1e433c2ecad7ff02f293 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 22 Jul 2019 17:47:54 -0700 Subject: [PATCH 241/572] Block cache analyzer: Compute correlation of features and human readable trace file. (#5596) Summary: - Compute the correlation between a few features and predictions, e.g., the number of accesses since the last access vs. the number of accesses till the next access on a block. - Output a human-readable trace file so Python can consume it.
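As a sketch of how the new pieces fit together, the analyzer can also be driven programmatically; the paths below are placeholders, while the constructor arguments and the WriteCorrelationFeatures* methods are the ones added in this patch:

#include "tools/block_cache_trace_analyzer.h"

int main() {
  rocksdb::BlockCacheTraceAnalyzer analyzer(
      "/path/to/block_cache_trace",          // placeholder trace file
      "/path/to/result_dir",                 // placeholder output directory
      "/path/to/result_dir/human_readable",  // new: human-readable dump
      /*compute_reuse_distance=*/false,
      /*mrc_only=*/false,
      /*cache_simulator=*/nullptr);
  rocksdb::Status s = analyzer.Analyze();
  // Analyze() returns Incomplete once it has consumed the whole trace.
  if (s.ok() || s.IsIncomplete()) {
    // Correlate past-access features with accesses/time till the next access.
    analyzer.WriteCorrelationFeatures("all", /*max_number_of_values=*/1000000);
    analyzer.WriteCorrelationFeaturesForGet(/*max_number_of_values=*/1000000);
  }
  return 0;
}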
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5596 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16373200 Pulled By: HaoyuHuang fbshipit-source-id: c848d26bc2e9210461f317d7dbee42d55be5a0cc --- tools/block_cache_trace_analyzer.cc | 475 ++++++++++++++++-- tools/block_cache_trace_analyzer.h | 95 +++- tools/block_cache_trace_analyzer_test.cc | 76 ++- trace_replay/block_cache_tracer.cc | 14 + trace_replay/block_cache_tracer.h | 5 +- utilities/simulator_cache/cache_simulator.cc | 66 +-- utilities/simulator_cache/cache_simulator.h | 85 ++-- .../simulator_cache/cache_simulator_test.cc | 79 +-- 8 files changed, 753 insertions(+), 142 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 76633846257..08143ebcf88 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -7,11 +7,16 @@ #ifdef GFLAGS #include "tools/block_cache_trace_analyzer.h" +#include #include +#include +#include #include #include #include +#include #include + #include "monitoring/histogram.h" #include "util/gflags_compat.h" #include "util/string_util.h" @@ -122,6 +127,20 @@ DEFINE_string(analyze_get_spatial_locality_labels, "", "Group data blocks using these labels."); DEFINE_string(analyze_get_spatial_locality_buckets, "", "Group data blocks by their statistics using these buckets."); +DEFINE_bool(mrc_only, false, "Evaluate alternative cache policies only. When this flag is true, " "the analyzer does NOT maintain states of each block in memory for " "analysis. It only feeds the accesses into the cache simulators."); +DEFINE_string( + analyze_correlation_coefficients_labels, "", + "Analyze the correlation coefficients of features such as number of past " + "accesses with regard to the number of accesses till the next access."); +DEFINE_int32(analyze_correlation_coefficients_max_number_of_values, 1000000, + "The maximum number of values for a feature. If the number of " + "values for a feature is larger than this max, it randomly " + "selects 'max' number of values."); +DEFINE_string(human_readable_trace_file_path, "", + "The file path that saves human readable access records."); namespace rocksdb { namespace { const std::string kSupportedCacheNames = "ghost_lru_hybrid_no_insert_on_row_miss "; // The suffix for the generated csv files.
+const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline"; +const std::string kFileNameSuffixMissTimeline = "miss_timeline"; const std::string kFileNameSuffixAccessTimeline = "access_timeline"; +const std::string kFileNameSuffixCorrelation = "correlation_input"; const std::string kFileNameSuffixAvgReuseIntervalNaccesses = "avg_reuse_interval_naccesses"; const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval"; @@ -279,6 +301,18 @@ double percent(uint64_t numerator, uint64_t denomenator) { return static_cast(numerator * 100.0 / denomenator); } +std::map adjust_time_unit( + const std::map& time_stats, uint64_t time_unit) { + if (time_unit == 1) { + return time_stats; + } + std::map adjusted_time_stats; + for (auto const& time : time_stats) { + adjusted_time_stats[static_cast(time.first / time_unit)] += + time.second; + } + return adjusted_time_stats; +} } // namespace void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { @@ -288,8 +322,12 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (output_dir_.empty()) { return; } + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + uint64_t total_accesses = access_sequence_number_; const std::string output_miss_ratio_curve_path = - output_dir_ + "/" + kMissRatioCurveFileName; + output_dir_ + "/" + std::to_string(trace_duration) + "_" + + std::to_string(total_accesses) + "_" + kMissRatioCurveFileName; std::ofstream out(output_miss_ratio_curve_path); if (!out.is_open()) { return; @@ -302,7 +340,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { for (auto const& config_caches : cache_simulator_->sim_caches()) { const CacheConfiguration& config = config_caches.first; for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { - double miss_ratio = config_caches.second[i]->miss_ratio(); + double miss_ratio = + config_caches.second[i]->miss_ratio_stats().miss_ratio(); // Write the body. 
out << config.cache_name; out << ","; @@ -314,13 +353,287 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; out << ","; - out << config_caches.second[i]->total_accesses(); + out << config_caches.second[i]->miss_ratio_stats().total_accesses(); out << std::endl; } } out.close(); } +void BlockCacheTraceAnalyzer::UpdateFeatureVectors( + const std::vector& access_sequence_number_timeline, + const std::vector& access_timeline, const std::string& label, + std::map* label_features, + std::map* label_predictions) const { + if (access_sequence_number_timeline.empty() || access_timeline.empty()) { + return; + } + assert(access_timeline.size() == access_sequence_number_timeline.size()); + uint64_t prev_access_sequence_number = access_sequence_number_timeline[0]; + uint64_t prev_access_timestamp = access_timeline[0]; + for (uint32_t i = 0; i < access_sequence_number_timeline.size(); i++) { + uint64_t num_accesses_since_last_access = + access_sequence_number_timeline[i] - prev_access_sequence_number; + uint64_t elapsed_time_since_last_access = + access_timeline[i] - prev_access_timestamp; + prev_access_sequence_number = access_sequence_number_timeline[i]; + prev_access_timestamp = access_timeline[i]; + if (i < access_sequence_number_timeline.size() - 1) { + (*label_features)[label].num_accesses_since_last_access.push_back( + num_accesses_since_last_access); + (*label_features)[label].num_past_accesses.push_back(i); + (*label_features)[label].elapsed_time_since_last_access.push_back( + elapsed_time_since_last_access); + } + if (i >= 1) { + (*label_predictions)[label].num_accesses_till_next_access.push_back( + num_accesses_since_last_access); + (*label_predictions)[label].elapsed_time_till_next_access.push_back( + elapsed_time_since_last_access); + } + } +} + +void BlockCacheTraceAnalyzer::WriteMissRatioTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map>> + cs_name_timeline; + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + const std::map& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + const std::map& trace_num_accesses = + adjust_time_unit(miss_ratio_stats_.num_accesses_timeline(), time_unit); + assert(trace_num_misses.size() == trace_num_accesses.size()); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = trace_num_accesses.find(time); + assert(it != trace_num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[port::kMaxUint64]["trace"][time] = percent(miss, access); + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + const std::map& num_accesses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_accesses_timeline(), + time_unit); + assert(num_misses.size() == num_accesses.size()); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = 
std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = num_accesses.find(time); + assert(it != num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = + percent(miss, access); + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissRatioTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map>> + cs_name_timeline; + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + const std::map& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[port::kMaxUint64]["trace"][time] = miss; + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = miss; + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeatures( + const std::string& label_str, uint32_t max_number_of_values) const { + std::set labels = ParseLabelStr(label_str); + std::map 
label_features; + std::map label_predictions; + auto block_callback = + [&](const std::string& cf_name, uint64_t fd, uint32_t level, + TraceType block_type, const std::string& /*block_key*/, + uint64_t /*block_key_id*/, const BlockAccessInfo& block) { + if (labels.find(kGroupbyCaller) != labels.end()) { + // Group by caller. + for (auto const& caller_map : block.caller_access_timeline) { + const std::string label = + BuildLabel(labels, cf_name, fd, level, block_type, + caller_map.first, /*block_id=*/0); + auto it = block.caller_access_sequence__number_timeline.find( + caller_map.first); + assert(it != block.caller_access_sequence__number_timeline.end()); + UpdateFeatureVectors(it->second, caller_map.second, label, + &label_features, &label_predictions); + } + return; + } + const std::string label = BuildLabel( + labels, cf_name, fd, level, block_type, + TableReaderCaller::kMaxBlockCacheLookupCaller, /*block_id=*/0); + UpdateFeatureVectors(block.access_sequence_number_timeline, + block.access_timeline, label, &label_features, + &label_predictions); + }; + TraverseBlocks(block_callback); + WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions, + max_number_of_values); +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map& label_features, + const std::map& label_predictions, + uint32_t max_number_of_values) const { + std::default_random_engine rand_engine(env_->NowMicros()); + for (auto const& label_feature_vectors : label_features) { + const Features& past = label_feature_vectors.second; + auto it = label_predictions.find(label_feature_vectors.first); + assert(it != label_predictions.end()); + const Predictions& future = it->second; + const std::string output_path = output_dir_ + "/" + label + "_" + + label_feature_vectors.first + "_" + + kFileNameSuffixCorrelation; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header( + "num_accesses_since_last_access,elapsed_time_since_last_access,num_" + "past_accesses,num_accesses_till_next_access,elapsed_time_till_next_" + "access"); + out << header << std::endl; + std::vector indexes; + for (uint32_t i = 0; i < past.num_accesses_since_last_access.size(); i++) { + indexes.push_back(i); + } + std::shuffle(indexes.begin(), indexes.end(), rand_engine); + for (uint32_t i = 0; i < max_number_of_values && i < indexes.size(); i++) { + uint32_t rand_index = indexes[i]; + out << std::to_string(past.num_accesses_since_last_access[rand_index]) + << ","; + out << std::to_string(past.elapsed_time_since_last_access[rand_index]) + << ","; + out << std::to_string(past.num_past_accesses[rand_index]) << ","; + out << std::to_string(future.num_accesses_till_next_access[rand_index]) + << ","; + out << std::to_string(future.elapsed_time_till_next_access[rand_index]) + << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesForGet( + uint32_t max_number_of_values) const { + std::string label = "GetKeyInfo"; + std::map label_features; + std::map label_predictions; + for (auto const& get_info : get_key_info_map_) { + const GetKeyInfo& info = get_info.second; + UpdateFeatureVectors(info.access_sequence_number_timeline, + info.access_timeline, label, &label_features, + &label_predictions); + } + WriteCorrelationFeaturesToFile(label, label_features, label_predictions, + max_number_of_values); +} + std::set BlockCacheTraceAnalyzer::ParseLabelStr( const std::string& label_str) const { std::stringstream ss(label_str); @@ 
-371,7 +684,6 @@ void BlockCacheTraceAnalyzer::TraverseBlocks( uint64_t /*block_key_id*/, const BlockAccessInfo& /*block_access_info*/)> block_callback) const { - uint64_t block_id = 0; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. const std::string& cf_name = cf_aggregates.first; @@ -387,8 +699,8 @@ void BlockCacheTraceAnalyzer::TraverseBlocks( block_type_aggregates.second.block_access_info_map) { // Stats per block. block_callback(cf_name, fd, level, type, block_access_info.first, - block_id, block_access_info.second); - block_id++; + block_access_info.second.block_id, + block_access_info.second); } } } @@ -1046,12 +1358,15 @@ void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats( BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, - bool compute_reuse_distance, + const std::string& human_readable_trace_file_path, + bool compute_reuse_distance, bool mrc_only, std::unique_ptr&& cache_simulator) : env_(rocksdb::Env::Default()), trace_file_path_(trace_file_path), output_dir_(output_dir), + human_readable_trace_file_path_(human_readable_trace_file_path), compute_reuse_distance_(compute_reuse_distance), + mrc_only_(mrc_only), cache_simulator_(std::move(cache_simulator)) {} void BlockCacheTraceAnalyzer::ComputeReuseDistance( @@ -1072,7 +1387,29 @@ void BlockCacheTraceAnalyzer::ComputeReuseDistance( info->unique_blocks_since_last_access.clear(); } -void BlockCacheTraceAnalyzer::RecordAccess( +Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord( + const BlockCacheTraceRecord& access, uint64_t block_id, + uint64_t get_key_id) { + if (!human_readable_trace_file_writer_) { + return Status::OK(); + } + int ret = snprintf( + trace_record_buffer_, sizeof(trace_record_buffer_), + "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%" PRIu32 ",%" PRIu64 + "" + ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", + access.access_timestamp, block_id, access.block_type, access.block_size, + access.cf_id, access.level, access.sst_fd_number, access.caller, + access.no_insert, access.get_id, get_key_id, access.referenced_data_size, + access.is_cache_hit); + if (ret < 0) { + return Status::IOError("failed to format the output"); + } + std::string printout(trace_record_buffer_); + return human_readable_trace_file_writer_->Append(printout); +} + +Status BlockCacheTraceAnalyzer::RecordAccess( const BlockCacheTraceRecord& access) { ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name]; SSTFileAccessInfoAggregate& file_aggr = @@ -1080,18 +1417,30 @@ void BlockCacheTraceAnalyzer::RecordAccess( file_aggr.level = access.level; BlockTypeAccessInfoAggregate& block_type_aggr = file_aggr.block_type_aggregates_map[access.block_type]; + if (block_type_aggr.block_access_info_map.find(access.block_key) == + block_type_aggr.block_access_info_map.end()) { + block_type_aggr.block_access_info_map[access.block_key].block_id = + unique_block_id_; + unique_block_id_++; + } BlockAccessInfo& block_access_info = block_type_aggr.block_access_info_map[access.block_key]; if (compute_reuse_distance_) { ComputeReuseDistance(&block_access_info); } - block_access_info.AddAccess(access); + block_access_info.AddAccess(access, access_sequence_number_); block_info_map_[access.block_key] = &block_access_info; - if (trace_start_timestamp_in_seconds_ == 0) { - trace_start_timestamp_in_seconds_ = - access.access_timestamp / kMicrosInSecond; + uint64_t get_key_id = 0; + if (access.caller == 
TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { + std::string row_key = BlockCacheTraceHelper::ComputeRowKey(access); + if (get_key_info_map_.find(row_key) == get_key_info_map_.end()) { + get_key_info_map_[row_key].key_id = unique_get_key_id_; + get_key_id = unique_get_key_id_; + unique_get_key_id_++; + } + get_key_info_map_[row_key].AddAccess(access, access_sequence_number_); } - trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; if (compute_reuse_distance_) { // Add this block to all existing blocks. @@ -1108,6 +1457,8 @@ void BlockCacheTraceAnalyzer::RecordAccess( } } } + return WriteHumanReadableTraceRecord(access, block_access_info.block_id, + get_key_id); } Status BlockCacheTraceAnalyzer::Analyze() { @@ -1122,32 +1473,68 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (!s.ok()) { return s; } + if (!human_readable_trace_file_path_.empty()) { + s = env_->NewWritableFile(human_readable_trace_file_path_, + &human_readable_trace_file_writer_, EnvOptions()); + if (!s.ok()) { + return s; + } + } uint64_t start = env_->NowMicros(); - uint64_t processed_records = 0; uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; s = reader.ReadAccess(&access); if (!s.ok()) { - return s; + break; + } + if (!mrc_only_) { + s = RecordAccess(access); + if (!s.ok()) { + break; + } } - RecordAccess(access); + if (trace_start_timestamp_in_seconds_ == 0) { + trace_start_timestamp_in_seconds_ = + access.access_timestamp / kMicrosInSecond; + } + trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + is_user_access(access.caller), + access.is_cache_hit == Boolean::kFalse); if (cache_simulator_) { cache_simulator_->Access(access); } - processed_records++; + access_sequence_number_++; uint64_t now = env_->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; if (duration > 10 * time_interval) { + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; fprintf(stdout, "Running for %" PRIu64 " seconds: Processed %" PRIu64 - " records/second\n", - duration, processed_records / duration); - processed_records = 0; + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); time_interval++; } } - return Status::OK(); + if (human_readable_trace_file_writer_) { + human_readable_trace_file_writer_->Flush(); + human_readable_trace_file_writer_->Close(); + } + uint64_t now = env_->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? 
access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); + return s; } void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { @@ -1321,15 +1708,6 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, "Top %" PRIu32 " access count blocks access_count=%" PRIu64 " %s\n", top_k, naccess_it->first, statistics.c_str()); - // if (block->referenced_data_size > block->block_size) { - // for (auto const& ref_key_it : block->key_num_access_map) { - // ParsedInternalKey internal_key; - // ParseInternalKey(ref_key_it.first, &internal_key); - // printf("######%lu %lu %d %s\n", block->referenced_data_size, - // block->block_size, internal_key.type, - // internal_key.user_key.ToString().c_str()); - // } - // } } } @@ -1696,16 +2074,32 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { exit(1); } } - BlockCacheTraceAnalyzer analyzer( - FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, - !FLAGS_reuse_distance_labels.empty(), std::move(cache_simulator)); + BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, + FLAGS_block_cache_analysis_result_dir, + FLAGS_human_readable_trace_file_path, + !FLAGS_reuse_distance_labels.empty(), + FLAGS_mrc_only, std::move(cache_simulator)); Status s = analyzer.Analyze(); - if (!s.IsIncomplete()) { + if (!s.IsIncomplete() && !s.ok()) { // Read all traces. fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); exit(1); } fprintf(stdout, "Status: %s\n", s.ToString().c_str()); + analyzer.WriteMissRatioCurves(); + analyzer.WriteMissRatioTimeline(1); + analyzer.WriteMissRatioTimeline(kSecondInMinute); + analyzer.WriteMissRatioTimeline(kSecondInHour); + analyzer.WriteMissTimeline(1); + analyzer.WriteMissTimeline(kSecondInMinute); + analyzer.WriteMissTimeline(kSecondInHour); + + if (FLAGS_mrc_only) { + fprintf(stdout, + "Skipping the analysis statistics since the user wants to compute " + "MRC only"); + return 0; + } analyzer.PrintStatsSummary(); if (FLAGS_print_access_count_stats) { @@ -1727,7 +2121,6 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.PrintDataBlockAccessStats(); } print_break_lines(/*num_break_lines=*/3); - analyzer.WriteMissRatioCurves(); if (!FLAGS_timeline_labels.empty()) { std::stringstream ss(FLAGS_timeline_labels); @@ -1819,6 +2212,18 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteGetSpatialLocality(label, buckets); } } + + if (!FLAGS_analyze_correlation_coefficients_labels.empty()) { + std::stringstream ss(FLAGS_analyze_correlation_coefficients_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteCorrelationFeatures( + label, FLAGS_analyze_correlation_coefficients_max_number_of_values); + } + analyzer.WriteCorrelationFeaturesForGet( + FLAGS_analyze_correlation_coefficients_max_number_of_values); + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 32a90342cb1..bc41ff468cc 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -16,8 +16,23 @@ #include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { + +// Statistics of a key refereneced by a Get. 
+struct GetKeyInfo { + uint64_t key_id = 0; + std::vector access_sequence_number_timeline; + std::vector access_timeline; + + void AddAccess(const BlockCacheTraceRecord& access, + uint64_t access_sequnce_number) { + access_sequence_number_timeline.push_back(access_sequnce_number); + access_timeline.push_back(access.access_timestamp); + } +}; + // Statistics of a block. struct BlockAccessInfo { + uint64_t block_id = 0; uint64_t num_accesses = 0; uint64_t block_size = 0; uint64_t first_access_time = 0; @@ -39,7 +54,16 @@ struct BlockAccessInfo { // Number of reuses grouped by reuse distance. std::map reuse_distance_count; - void AddAccess(const BlockCacheTraceRecord& access) { + // The access sequence numbers of this block. + std::vector access_sequence_number_timeline; + std::map> + caller_access_sequence__number_timeline; + // The access timestamp in microseconds of this block. + std::vector access_timeline; + std::map> caller_access_timeline; + + void AddAccess(const BlockCacheTraceRecord& access, + uint64_t access_sequnce_number) { if (block_size != 0 && access.block_size != 0) { assert(block_size == access.block_size); } @@ -57,6 +81,12 @@ struct BlockAccessInfo { const uint64_t timestamp_in_seconds = access.access_timestamp / kMicrosInSecond; caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1; + // Populate the feature vectors. + access_sequence_number_timeline.push_back(access_sequnce_number); + caller_access_sequence__number_timeline[access.caller].push_back( + access_sequnce_number); + access_timeline.push_back(access.access_timestamp); + caller_access_timeline[access.caller].push_back(access.access_timestamp); if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type, access.caller)) { num_keys = access.num_keys_in_block; @@ -94,11 +124,23 @@ struct ColumnFamilyAccessInfoAggregate { std::map fd_aggregates_map; }; +struct Features { + std::vector elapsed_time_since_last_access; + std::vector num_accesses_since_last_access; + std::vector num_past_accesses; +}; + +struct Predictions { + std::vector elapsed_time_till_next_access; + std::vector num_accesses_till_next_access; +}; + class BlockCacheTraceAnalyzer { public: BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, - bool compute_reuse_distance, + const std::string& human_readable_trace_file_path, + bool compute_reuse_distance, bool mrc_only, std::unique_ptr&& cache_simulator); ~BlockCacheTraceAnalyzer() = default; // No copy and move. @@ -184,6 +226,24 @@ class BlockCacheTraceAnalyzer { // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses". void WriteMissRatioCurves() const; + // Write miss ratio timeline of simulated cache configurations into several + // csv files, one per cache capacity saved in 'output_dir'. + // + // The file format is + // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second" + // where N is the number of unique cache names + // (cache_name+num_shard_bits+ghost_capacity). + void WriteMissRatioTimeline(uint64_t time_unit) const; + + // Write misses timeline of simulated cache configurations into several + // csv files, one per cache capacity saved in 'output_dir'. + // + // The file format is + // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second" + // where N is the number of unique cache names + // (cache_name+num_shard_bits+ghost_capacity). 
+ void WriteMissTimeline(uint64_t time_unit) const; + // Write the access timeline into a csv file saved in 'output_dir'. // // The file is named "label_access_timeline".The file format is @@ -236,6 +296,11 @@ class BlockCacheTraceAnalyzer { const std::string& label_str, const std::vector& percent_buckets) const; + void WriteCorrelationFeatures(const std::string& label_str, + uint32_t max_number_of_values) const; + + void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const; + const std::map& TEST_cf_aggregates_map() const { return cf_aggregates_map_; @@ -251,7 +316,7 @@ class BlockCacheTraceAnalyzer { void ComputeReuseDistance(BlockAccessInfo* info) const; - void RecordAccess(const BlockCacheTraceRecord& access); + Status RecordAccess(const BlockCacheTraceRecord& access); void UpdateReuseIntervalStats( const std::string& label, const std::vector& time_buckets, @@ -278,17 +343,41 @@ class BlockCacheTraceAnalyzer { const BlockAccessInfo& /*block_access_info*/)> block_callback) const; + void UpdateFeatureVectors( + const std::vector& access_sequence_number_timeline, + const std::vector& access_timeline, const std::string& label, + std::map* label_features, + std::map* label_predictions) const; + + void WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map& label_features, + const std::map& label_predictions, + uint32_t max_number_of_values) const; + + Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access, + uint64_t block_id, uint64_t get_key_id); + rocksdb::Env* env_; const std::string trace_file_path_; const std::string output_dir_; + std::string human_readable_trace_file_path_; const bool compute_reuse_distance_; + const bool mrc_only_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; std::map block_info_map_; + std::unordered_map get_key_info_map_; + uint64_t access_sequence_number_ = 0; uint64_t trace_start_timestamp_in_seconds_ = 0; uint64_t trace_end_timestamp_in_seconds_ = 0; + MissRatioStats miss_ratio_stats_; + uint64_t unique_block_id_ = 1; + uint64_t unique_get_key_id_ = 1; + char trace_record_buffer_[1024 * 1024]; + std::unique_ptr human_readable_trace_file_writer_; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 45ef99eee75..a028bf197c9 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -117,7 +117,8 @@ class BlockCacheTracerTest : public testing::Test { // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
- record.referenced_key = kRefKeyPrefix + std::to_string(key_id); + record.referenced_key = + kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0); record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; ASSERT_OK(writer->WriteBlockAccess( @@ -179,7 +180,8 @@ class BlockCacheTracerTest : public testing::Test { "-analyze_get_spatial_locality_labels=" + analyze_get_spatial_locality_labels_, "-analyze_get_spatial_locality_buckets=" + - analyze_get_spatial_locality_buckets_}; + analyze_get_spatial_locality_buckets_, + "-analyze_correlation_coefficients_labels=all"}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -236,9 +238,9 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { RunBlockCacheTraceAnalyzer(); { // Validate the cache miss ratios. - const std::vector expected_capacities{1024, 1024 * 1024, - 1024 * 1024 * 1024}; - const std::string mrc_path = test_path_ + "/mrc"; + std::vector expected_capacities{1024, 1024 * 1024, + 1024 * 1024 * 1024}; + const std::string mrc_path = test_path_ + "/49_50_mrc"; std::ifstream infile(mrc_path); uint32_t config_index = 0; std::string line; @@ -266,6 +268,68 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { ASSERT_EQ(expected_capacities.size(), config_index); infile.close(); ASSERT_OK(env_->DeleteFile(mrc_path)); + + const std::vector time_units{"1", "60", "3600"}; + expected_capacities.push_back(port::kMaxUint64); + for (auto const& expected_capacity : expected_capacities) { + for (auto const& time_unit : time_units) { + const std::string miss_ratio_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_ratio_timeline"; + std::ifstream mrt_file(miss_ratio_timeline_path); + // Read header. + ASSERT_TRUE(getline(mrt_file, line)); + ASSERT_TRUE(getline(mrt_file, line)); + std::stringstream ss(line); + bool read_header = false; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (!read_header) { + if (expected_capacity == port::kMaxUint64) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + read_header = true; + continue; + } + ASSERT_DOUBLE_EQ(100.0, ParseDouble(substr)); + } + ASSERT_FALSE(getline(mrt_file, line)); + mrt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_ratio_timeline_path)); + } + for (auto const& time_unit : time_units) { + const std::string miss_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_timeline"; + std::ifstream mt_file(miss_timeline_path); + // Read header. + ASSERT_TRUE(getline(mt_file, line)); + ASSERT_TRUE(getline(mt_file, line)); + std::stringstream ss(line); + uint32_t num_misses = 0; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (num_misses == 0) { + if (expected_capacity == port::kMaxUint64) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + num_misses++; + continue; + } + num_misses += ParseInt(substr); + } + ASSERT_EQ(51, num_misses); + ASSERT_FALSE(getline(mt_file, line)); + mt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_timeline_path)); + } + } } { // Validate the timeline csv files. @@ -543,7 +607,9 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // Read blocks. 
BlockCacheTraceAnalyzer analyzer(trace_file_path_, /*output_miss_ratio_curve_path=*/"", + /*human_readable_trace_file_path=*/"", /*compute_reuse_distance=*/true, + /*mrc_only=*/false, /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 4f320ef2d0f..1eeb64ac85d 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -6,6 +6,7 @@ #include "trace_replay/block_cache_tracer.h" #include "db/db_impl/db_impl.h" +#include "db/dbformat.h" #include "rocksdb/slice.h" #include "util/coding.h" #include "util/hash.h" @@ -54,6 +55,19 @@ bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) { caller == TableReaderCaller::kUserVerifyChecksum; } +std::string BlockCacheTraceHelper::ComputeRowKey( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return ""; + } + Slice key = ExtractUserKey(access.referenced_key); + uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); + return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + + std::to_string(seq_no); +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index b1a258843e5..3863ca430a4 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -20,6 +20,7 @@ extern const uint64_t kMicrosInSecond; extern const uint64_t kSecondInMinute; extern const uint64_t kSecondInHour; +struct BlockCacheTraceRecord; class BlockCacheTraceHelper { public: @@ -27,7 +28,9 @@ class BlockCacheTraceHelper { TableReaderCaller caller); static bool IsGetOrMultiGet(TableReaderCaller caller); static bool IsUserAccess(TableReaderCaller caller); - + // Row key is a concatenation of the access's fd_number and the referenced + // user key. + static std::string ComputeRowKey(const BlockCacheTraceRecord& access); static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; }; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 90433df11bf..06de4c11996 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -4,13 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "utilities/simulator_cache/cache_simulator.h" +#include #include "db/dbformat.h" namespace rocksdb { namespace { const std::string kGhostCachePrefix = "ghost_"; -} +} // namespace GhostCache::GhostCache(std::shared_ptr sim_cache) : sim_cache_(sim_cache) {} @@ -22,7 +23,7 @@ bool GhostCache::Admit(const Slice& lookup_key) { return true; } sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), - /*deleter=*/nullptr, /*handle=*/nullptr); + /*deleter=*/nullptr); return false; } @@ -43,18 +44,27 @@ void CacheSimulator::Access(const BlockCacheTraceRecord& access) { sim_cache_->Release(handle); is_cache_miss = false; } else { - if (access.no_insert == Boolean::kFalse && admit) { + if (access.no_insert == Boolean::kFalse && admit && access.block_size > 0) { sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr, /*handle=*/nullptr); + /*deleter=*/nullptr); } } - UpdateMetrics(is_user_access, is_cache_miss); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + is_cache_miss); } -void CacheSimulator::UpdateMetrics(bool is_user_access, bool is_cache_miss) { +void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms, + bool is_user_access, bool is_cache_miss) { + uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond; + num_accesses_timeline_[timestamp_in_seconds] += 1; num_accesses_ += 1; + if (num_misses_timeline_.find(timestamp_in_seconds) == + num_misses_timeline_.end()) { + num_misses_timeline_[timestamp_in_seconds] = 0; + } if (is_cache_miss) { num_misses_ += 1; + num_misses_timeline_[timestamp_in_seconds] += 1; } if (is_user_access) { user_accesses_ += 1; @@ -76,8 +86,8 @@ Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( void PrioritizedCacheSimulator::AccessKVPair( const Slice& key, uint64_t value_size, Cache::Priority priority, - bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, - bool update_metrics) { + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, + bool* is_cache_miss, bool* admitted, bool update_metrics) { assert(is_cache_miss); assert(admitted); *is_cache_miss = true; @@ -90,11 +100,12 @@ void PrioritizedCacheSimulator::AccessKVPair( sim_cache_->Release(handle); *is_cache_miss = false; } else if (!no_insert && *admitted && value_size > 0) { - sim_cache_->Insert(key, /*value=*/nullptr, value_size, - /*deleter=*/nullptr, /*handle=*/nullptr, priority); + sim_cache_->Insert(key, /*value=*/nullptr, value_size, /*deleter=*/nullptr, + /*handle=*/nullptr, priority); } if (update_metrics) { - UpdateMetrics(is_user_access, *is_cache_miss); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + *is_cache_miss); } } @@ -102,38 +113,28 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { bool is_cache_miss = true; bool admitted = true; AccessKVPair(access.block_key, access.block_size, - ComputeBlockPriority(access), access.no_insert, + ComputeBlockPriority(access), access, access.no_insert, BlockCacheTraceHelper::IsUserAccess(access.caller), &is_cache_miss, &admitted, /*update_metrics=*/true); } -std::string HybridRowBlockCacheSimulator::ComputeRowKey( - const BlockCacheTraceRecord& access) { - assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); - Slice key = ExtractUserKey(access.referenced_key); - uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse - ? 
0 - : 1 + GetInternalKeySeqno(access.referenced_key); - return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + - std::to_string(seq_no); -} - void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { - bool is_cache_miss = true; - bool admitted = true; // TODO (haoyu): We only support Get for now. We need to extend the tracing // for MultiGet, i.e., non-data block accesses must log all keys in a // MultiGet. + bool is_cache_miss = false; + bool admitted = false; if (access.caller == TableReaderCaller::kUserGet && access.get_id != BlockCacheTraceHelper::kReservedGetId) { // This is a Get/MultiGet request. - const std::string& row_key = ComputeRowKey(access); + const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); if (getid_getkeys_map_[access.get_id].find(row_key) == getid_getkeys_map_[access.get_id].end()) { // This is the first time that this key is accessed. Look up the key-value // pair first. Do not update the miss/accesses metrics here since it will // be updated later. AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + access, /*no_insert=*/false, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/false); @@ -154,28 +155,31 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { // referenced key-value pair already. Thus, we treat these lookups as // hits. This is also to ensure the total number of accesses are the same // when comparing to other policies. - UpdateMetrics(/*is_user_access=*/true, /*is_cache_miss=*/false); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); return; } // The key-value pair observes a cache miss. We need to access its // index/filter/data blocks. 
AccessKVPair( access.block_key, access.block_type, ComputeBlockPriority(access), + access, /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/true); if (access.referenced_data_size > 0 && miss_inserted.second == InsertResult::ADMITTED) { - sim_cache_->Insert( - row_key, /*value=*/nullptr, access.referenced_data_size, - /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); + sim_cache_->Insert(row_key, /*value=*/nullptr, + access.referenced_data_size, /*deleter=*/nullptr, + /*handle=*/nullptr, Cache::Priority::HIGH); getid_getkeys_map_[access.get_id][row_key] = std::make_pair(true, InsertResult::INSERTED); } return; } AccessKVPair(access.block_key, access.block_size, - ComputeBlockPriority(access), access.no_insert, + ComputeBlockPriority(access), access, access.no_insert, BlockCacheTraceHelper::IsUserAccess(access.caller), &is_cache_miss, &admitted, /*update_metrics=*/true); } diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 82972688658..3863fcf88dd 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -5,6 +5,9 @@ #pragma once +#include + +#include "cache/lru_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -29,6 +32,51 @@ struct CacheConfiguration { } }; +class MissRatioStats { + public: + void reset_counter() { + num_misses_ = 0; + num_accesses_ = 0; + user_accesses_ = 0; + user_misses_ = 0; + } + double miss_ratio() const { + if (num_accesses_ == 0) { + return -1; + } + return static_cast(num_misses_ * 100.0 / num_accesses_); + } + uint64_t total_accesses() const { return num_accesses_; } + + const std::map& num_accesses_timeline() const { + return num_accesses_timeline_; + } + + const std::map& num_misses_timeline() const { + return num_misses_timeline_; + } + + double user_miss_ratio() const { + if (user_accesses_ == 0) { + return -1; + } + return static_cast(user_misses_ * 100.0 / user_accesses_); + } + uint64_t user_accesses() const { return user_accesses_; } + + void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, + bool is_cache_miss); + + private: + uint64_t num_accesses_ = 0; + uint64_t num_misses_ = 0; + uint64_t user_accesses_ = 0; + uint64_t user_misses_ = 0; + + std::map num_accesses_timeline_; + std::map num_misses_timeline_; +}; + // A ghost cache admits an entry on its second access. 
class GhostCache { public: @@ -61,37 +109,15 @@ class CacheSimulator { CacheSimulator& operator=(CacheSimulator&&) = delete; virtual void Access(const BlockCacheTraceRecord& access); - void reset_counter() { - num_misses_ = 0; - num_accesses_ = 0; - user_accesses_ = 0; - user_misses_ = 0; - } - double miss_ratio() const { - if (num_accesses_ == 0) { - return -1; - } - return static_cast(num_misses_ * 100.0 / num_accesses_); - } - uint64_t total_accesses() const { return num_accesses_; } - double user_miss_ratio() const { - if (user_accesses_ == 0) { - return -1; - } - return static_cast(user_misses_ * 100.0 / user_accesses_); - } - uint64_t user_accesses() const { return user_accesses_; } + void reset_counter() { miss_ratio_stats_.reset_counter(); } - protected: - void UpdateMetrics(bool is_user_access, bool is_cache_miss); + const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; } + protected: + MissRatioStats miss_ratio_stats_; std::unique_ptr ghost_cache_; std::shared_ptr sim_cache_; - uint64_t num_accesses_ = 0; - uint64_t num_misses_ = 0; - uint64_t user_accesses_ = 0; - uint64_t user_misses_ = 0; }; // A prioritized cache simulator that runs against a block cache trace. @@ -107,7 +133,8 @@ class PrioritizedCacheSimulator : public CacheSimulator { protected: // Access the key-value pair and returns true upon a cache miss. void AccessKVPair(const Slice& key, uint64_t value_size, - Cache::Priority priority, bool no_insert, + Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, bool update_metrics); @@ -135,10 +162,6 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { void Access(const BlockCacheTraceRecord& access) override; private: - // Row key is a concatenation of the access's fd_number and the referenced - // user key. 
- std::string ComputeRowKey(const BlockCacheTraceRecord& access); - enum InsertResult : char { INSERTED, ADMITTED, diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index f435785e6a1..dc3b8327e01 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -94,21 +94,21 @@ TEST_F(CacheSimulatorTest, CacheSimulator) { new CacheSimulator(nullptr, sim_cache)); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(50, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); cache_simulator->Access(compaction_access); cache_simulator->Access(compaction_access); - ASSERT_EQ(4, cache_simulator->total_accesses()); - ASSERT_EQ(75, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + ASSERT_EQ(4, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); cache_simulator->reset_counter(); - ASSERT_EQ(0, cache_simulator->total_accesses()); - ASSERT_EQ(-1, cache_simulator->miss_ratio()); + ASSERT_EQ(0, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(-1, cache_simulator->miss_ratio_stats().miss_ratio()); auto handle = sim_cache->Lookup(access.block_key); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); @@ -129,9 +129,9 @@ TEST_F(CacheSimulatorTest, GhostCacheSimulator) { /*high_pri_pool_ratio=*/0))); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); // Both of them will be miss since we have a ghost cache. - ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); } TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { @@ -144,8 +144,8 @@ TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { new PrioritizedCacheSimulator(nullptr, sim_cache)); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(50, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); auto handle = sim_cache->Lookup(access.block_key); ASSERT_NE(nullptr, handle); @@ -166,9 +166,9 @@ TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { /*high_pri_pool_ratio=*/0))); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); // Both of them will be miss since we have a ghost cache. 
- ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); } TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { @@ -200,10 +200,11 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - ASSERT_EQ(10, cache_simulator->total_accesses()); - ASSERT_EQ(100, cache_simulator->miss_ratio()); - ASSERT_EQ(10, cache_simulator->user_accesses()); - ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); auto handle = sim_cache->Lookup( std::to_string(first_get.sst_fd_number) + "_" + ExtractUserKey(first_get.referenced_key).ToString() + "_" + @@ -225,10 +226,12 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(second_get); block_id++; } - ASSERT_EQ(15, cache_simulator->total_accesses()); - ASSERT_EQ(66, static_cast(cache_simulator->miss_ratio())); - ASSERT_EQ(15, cache_simulator->user_accesses()); - ASSERT_EQ(66, static_cast(cache_simulator->user_miss_ratio())); + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); handle = sim_cache->Lookup( std::to_string(second_get.sst_fd_number) + "_" + ExtractUserKey(second_get.referenced_key).ToString() + "_" + @@ -252,10 +255,12 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(third_get); block_id++; } - ASSERT_EQ(20, cache_simulator->total_accesses()); - ASSERT_EQ(75, static_cast(cache_simulator->miss_ratio())); - ASSERT_EQ(20, cache_simulator->user_accesses()); - ASSERT_EQ(75, static_cast(cache_simulator->user_miss_ratio())); + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); // Assert that the third key is not inserted into the cache. handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" + third_get.referenced_key); @@ -318,19 +323,21 @@ TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) { // Two get requests access the same key. cache_simulator->Access(first_get); cache_simulator->Access(second_get); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(100, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); // We insert the key-value pair upon the second get request. A third get // request should observe a hit. 
for (uint32_t i = 0; i < 10; i++) {
    cache_simulator->Access(third_get);
  }
-  ASSERT_EQ(12, cache_simulator->total_accesses());
-  ASSERT_EQ(16, static_cast<uint64_t>(cache_simulator->miss_ratio()));
-  ASSERT_EQ(12, cache_simulator->user_accesses());
-  ASSERT_EQ(16, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().miss_ratio()));
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().user_miss_ratio()));
 }
 }  // namespace rocksdb

From 66b5613d0c3f84e5ef72c43b62a2e9866efdde8a Mon Sep 17 00:00:00 2001
From: sdong
Date: Mon, 22 Jul 2019 18:53:03 -0700
Subject: [PATCH 242/572] row_cache to share entry for recent snapshots (#5600)

Summary:
Right now, users cannot take advantage of the row cache unless no snapshot is used, or Get() is repeated for the same snapshots. This limits the usage of the row cache.
This change eliminates this restriction in some cases. If the snapshot used is newer than the largest sequence number in the file, and no write callback function is registered, the same row cache key is used as if no snapshot were given (see the sketch after the HISTORY.md hunk below). We still need the callback function restriction for now because the callback function may filter out different keys for different snapshots even if the snapshots are new.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5600

Test Plan: Add a unit test.

Differential Revision: D16386616

fbshipit-source-id: 6b7d214bd215d191b03ccf55926ad4b703ec2e53
---
 HISTORY.md | 1 +
 db/db_test2.cc | 46 +++++++++++++++++++++++++++++++++++++++++++++
 db/table_cache.cc | 21 +++++++++++++++++++--
 table/get_context.h | 2 ++
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b9d0f741317..efd49f642b0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -33,6 +33,7 @@
 * Reduce iterator key comparison for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel
+* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.

 ### General Improvements
 * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
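[Editor's note] The row-cache key rule described in the #5600 summary above can be condensed into a small predicate. The sketch below is illustrative only: the helper name RowCacheSeqnoSuffix and its flattened parameters are hypothetical, and the real logic lives inline in TableCache::Get (see the table_cache.cc hunk that follows).

// Hypothetical condensation of the row-cache key rule from #5600.
#include <cstdint>

// Returns the sequence-number suffix appended to the row cache key.
uint64_t RowCacheSeqnoSuffix(bool has_snapshot, bool has_seq_callback,
                             uint64_t snapshot_seqno, uint64_t lookup_key_seqno,
                             uint64_t file_largest_seqno) {
  if (!has_snapshot) {
    // No snapshot: all readers share a single entry per key.
    return 0;
  }
  if (!has_seq_callback && snapshot_seqno > file_largest_seqno) {
    // The snapshot is newer than everything in this file, so the read can
    // share the same cache entry as a snapshot-less read.
    return 0;
  }
  // Otherwise the entry must be snapshot-specific; the +1 distinguishes a
  // real sequence number 0 from the "no snapshot" tag above.
  return 1 + lookup_key_seqno;
}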
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 109a7a377bf..3664b3a249f 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   delete db_;
   db_ = nullptr;
 }
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
 }  // namespace rocksdb

 int main(int argc, char** argv) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 121d4941fc0..2290b5939c5 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -11,6 +11,7 @@
 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "file/filename.h"
@@ -24,6 +25,7 @@
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "test_util/sync_point.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
@@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
       // sequence key increases. However, to support caching snapshot
       // reads, we append the sequence number (incremented by 1 to
       // distinguish from 0) only in this case.
-      uint64_t seq_no =
-          options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+      // If the snapshot is larger than the largest seqno in the file,
+      // all data should be exposed to the snapshot, so we treat it
+      // the same as if there is no snapshot. The exception is that if
+      // a seq-checking callback is registered, some internal keys
+      // may still be filtered out.
+      uint64_t seq_no = 0;
+      // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+      if (options.snapshot != nullptr &&
+          (get_context->has_callback() ||
+           static_cast_with_check<const SnapshotImpl, const Snapshot>(
+               options.snapshot)
+                   ->GetSequenceNumber() <= fd.largest_seqno)) {
+        // We should consider using options.snapshot->GetSequenceNumber()
+        // instead of GetInternalKeySeqno(k), which would make the code
+        // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(k); + } // Compute row cache key. row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), diff --git a/table/get_context.h b/table/get_context.h index 7a37beb2df2..7110ceae806 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -136,6 +136,8 @@ class GetContext { void ReportCounters(); + bool has_callback() const { return callback_ != nullptr; } + uint64_t get_tracing_get_id() const { return tracing_get_id_; } private: From 327c4807a7fe8532326323e2753670daf06a0f6b Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 22 Jul 2019 20:01:25 -0700 Subject: [PATCH 243/572] Disable refresh snapshot feature by default (#5606) Summary: There are concerns about the correctness of this patch. Disabling by default until the concerns are resolved. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5606 Differential Revision: D16428064 Pulled By: maysamyabandeh fbshipit-source-id: a89280f0ea85796c9c9dfbfd9a8e91dad9b000b3 --- HISTORY.md | 2 +- include/rocksdb/options.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index efd49f642b0..19f4ce1297c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,7 +20,7 @@ * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 ### New Features -* Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. +* Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 234af6a31eb..35c27556553 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -275,10 +275,10 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // this option helps reducing the cpu usage of long-running compactions. The // feature is disabled when max_subcompactions is greater than one. // - // Default: 0.1s + // Default: 0 // // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 100 * 1000 * 1000; // 0.1s + uint64_t snap_refresh_nanos = 0; // Disable automatic compactions. Manual compactions can still // be issued on this column family From eae832740b16f9c2fbe2225f9c3eef0c0a1e1f48 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 23 Jul 2019 08:04:58 -0700 Subject: [PATCH 244/572] WriteUnPrepared: improve read your own write functionality (#5573) Summary: There are a number of fixes in this PR (with most bugs found via the added stress tests): 1. Re-enable reseek optimization. 
This was initially disabled to avoid infinite loops in https://github.com/facebook/rocksdb/pull/3955, but this can be resolved by remembering not to reseek after a reseek has already been done. This problem only affects forward iteration in `DBIter::FindNextUserEntryInternal`, as we already disable reseeking in `DBIter::FindValueForCurrentKeyUsingSeek`.
2. Verify that ReadOptions.snapshot can be safely used for iterator creation. Some snapshots would not give correct results because snapshot validation would not be enforced, breaking some assumptions in Prev() iteration.
3. In the non-snapshot Get() case, reads done at `LastPublishedSequence` may not be enough, because unprepared sequence numbers are not published. Use `std::max(published_seq, max_visible_seq)` to do lookups instead.
4. Add a stress test to test reading your own writes.
5. Fix a minor bug in the allow_concurrent_memtable_write case where we forgot to pass in batch_per_txn_.
6. Minor performance optimization in `CalcMaxUnpreparedSequenceNumber` by assigning by reference instead of by value.
7. Add some more comments everywhere.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5573

Differential Revision: D16276089

Pulled By: lth

fbshipit-source-id: 18029c944eb427a90a87dee76ac1b23f37ec1ccb
---
 db/db_impl/db_impl.cc | 15 +
 db/db_impl/db_impl_write.cc | 3 +-
 db/db_iter.cc | 33 +-
 db/read_callback.h | 3 -
 utilities/transactions/transaction_test.cc | 16 +
 .../write_unprepared_transaction_test.cc | 345 +++++++++++-------
 .../transactions/write_unprepared_txn.cc | 80 ++--
 utilities/transactions/write_unprepared_txn.h | 57 ++-
 .../transactions/write_unprepared_txn_db.cc | 82 ++++-
 9 files changed, 436 insertions(+), 198 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index af9aea011a3..8132d5a0b38 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1499,7 +1499,22 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
                  ? versions_->LastSequence()
                  : versions_->LastPublishedSequence();
   if (callback) {
+    // The unprep_seqs are not published for write unprepared, so it could be
+    // that max_visible_seq is larger. Seek to the std::max of the two.
+    // However, we still want our callback to contain the actual snapshot so
+    // that it can do the correct visibility filtering.
     callback->Refresh(snapshot);
+
+    // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+    // max_visible_seq = max(max_visible_seq, snapshot)
+    //
+    // Currently, the commented out assert is broken by
+    // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+    // the regular transaction flow, then this special read callback would not
+    // be needed.
+ // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index c0d320013b7..95a1b31c769 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -172,7 +172,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_); PERF_TIMER_START(write_pre_and_post_process_time); } diff --git a/db/db_iter.cc b/db/db_iter.cc index 633724c5763..060138fd64b 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -263,12 +263,6 @@ class DBIter final: public Iterator { bool TooManyInternalKeysSkipped(bool increment = true); inline bool IsVisible(SequenceNumber sequence); - // CanReseekToSkip() returns whether the iterator can use the optimization - // where it reseek by sequence number to get the next key when there are too - // many versions. This is disabled for write unprepared because seeking to - // sequence number does not guarantee that it is visible. - inline bool CanReseekToSkip(); - // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() // is called void TempPinData() { @@ -453,6 +447,11 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // greater than that, // - none of the above : saved_key_ can contain anything, it doesn't matter. uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; is_blob_ = false; @@ -498,6 +497,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) assert(!skipping || user_comparator_.Compare( ikey_.user_key, saved_key_.GetUserKey()) > 0); num_skipped = 0; + reseek_done = false; switch (ikey_.type) { case kTypeDeletion: case kTypeSingleDeletion: @@ -551,6 +551,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // they are hidden by this deletion. skipping = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else if (ikey_.type == kTypeBlobIndex) { if (!allow_blob_) { @@ -581,6 +582,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // they are hidden by this deletion. skipping = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else { // By now, we are sure the current ikey is going to yield a @@ -611,14 +613,23 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); skipping = false; num_skipped = 0; + reseek_done = false; } } // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. - if (num_skipped > max_skip_ && CanReseekToSkip()) { + // + // To avoid infinite loops, do not reseek if we have already attempted to + // reseek previously. 
+  //
+  // TODO(lth): If we reseek to a sequence number greater than ikey_.sequence,
+  // then it does not make sense to reseek as we would actually land further
+  // away from the desired key. There is opportunity for optimization here.
+  if (num_skipped > max_skip_ && !reseek_done) {
     is_key_seqnum_zero_ = false;
     num_skipped = 0;
+    reseek_done = true;
     std::string last_key;
     if (skipping) {
       // We're looking for the next user-key but all we see are the same
@@ -937,7 +948,7 @@ bool DBIter::FindValueForCurrentKey() {
   // This user key has lots of entries.
   // We're going from old to new, and it's taking too long. Let's do a Seek()
   // and go from new to old. This helps when a key was overwritten many times.
-  if (num_skipped >= max_skip_ && CanReseekToSkip()) {
+  if (num_skipped >= max_skip_) {
     return FindValueForCurrentKeyUsingSeek();
   }
@@ -1234,7 +1245,7 @@ bool DBIter::FindUserKeyBeforeSavedKey() {
       PERF_COUNTER_ADD(internal_key_skipped_count, 1);
     }
-    if (num_skipped >= max_skip_ && CanReseekToSkip()) {
+    if (num_skipped >= max_skip_) {
       num_skipped = 0;
       IterKey last_key;
       last_key.SetInternalKey(ParsedInternalKey(
@@ -1281,10 +1292,6 @@ bool DBIter::IsVisible(SequenceNumber sequence) {
   }
 }

-bool DBIter::CanReseekToSkip() {
-  return read_callback_ == nullptr || read_callback_->CanReseekToSkip();
-}
-
 void DBIter::Seek(const Slice& target) {
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
   StopWatch sw(env_, statistics_, DB_SEEK);
diff --git a/db/read_callback.h b/db/read_callback.h
index 60f91ef872d..d8801e65173 100644
--- a/db/read_callback.h
+++ b/db/read_callback.h
@@ -42,9 +42,6 @@ class ReadCallback {
   // Refresh to a more recent visible seq
   virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }

-  // Refer to DBIter::CanReseekToSkip
-  virtual bool CanReseekToSkip() { return true; }
-
  protected:
   // The max visible seq, it is usually the snapshot but could be larger if
   // transaction has its own writes written to db.
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index a410c5b5196..7868d0060e9 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -3471,6 +3471,12 @@ TEST_P(TransactionTest, LockLimitTest) {
 }

 TEST_P(TransactionTest, IteratorTest) {
+  // This test does writes without snapshot validation, and then tries to
+  // create an iterator later, which is unsupported in write unprepared.
+  if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+    return;
+  }
+
   WriteOptions write_options;
   ReadOptions read_options, snapshot_read_options;
   std::string value;
@@ -3589,6 +3595,16 @@ TEST_P(TransactionTest, IteratorTest) {
 }

 TEST_P(TransactionTest, DisableIndexingTest) {
+  // Skip this test for write unprepared. It does not solely rely on WBWI for
+  // read your own writes, so depending on whether batches are flushed or not,
+  // only some writes will be visible.
+  //
+  // Also, write unprepared does not support creating iterators if there has
+  // been txn->Put() without snapshot validation.
+ if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + WriteOptions write_options; ReadOptions read_options; std::string value; diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index faa6c774578..a2546229e4d 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -37,6 +37,9 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_UNPREPARED))); TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { + // The following tests checks whether reading your own write for + // a transaction works for write unprepared, when there are uncommitted + // values written into DB. auto verify_state = [](Iterator* iter, const std::string& key, const std::string& value) { ASSERT_TRUE(iter->Valid()); @@ -45,155 +48,251 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { ASSERT_EQ(value, iter->value().ToString()); }; - options.disable_auto_compactions = true; - ReOpen(); - - // The following tests checks whether reading your own write for - // a transaction works for write unprepared, when there are uncommitted - // values written into DB. - // - // Although the values written by DB::Put are technically committed, we add - // their seq num to unprep_seqs_ to pretend that they were written into DB - // as part of an unprepared batch, and then check if they are visible to the - // transaction. - auto snapshot0 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v1")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v2")); - auto snapshot2 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v3")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v4")); - auto snapshot4 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v5")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v6")); - auto snapshot6 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v7")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v8")); - auto snapshot8 = db->GetSnapshot(); - - TransactionOptions txn_options; - WriteOptions write_options; - Transaction* txn = db->BeginTransaction(write_options, txn_options); - WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); - - ReadOptions roptions; - roptions.snapshot = snapshot0; - - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - auto iter = txn->GetIterator(roptions); + // Test always reseeking vs never reseeking. + for (uint64_t max_skip : {0, std::numeric_limits::max()}) { + options.max_sequential_skip_in_iterations = max_skip; + options.disable_auto_compactions = true; + ReOpen(); - // Test Get(). 
- std::string value; + TransactionOptions txn_options; + WriteOptions woptions; + ReadOptions roptions; - ASSERT_OK(txn->Get(roptions, Slice("a"), &value)); - ASSERT_EQ(value, "v3"); + ASSERT_OK(db->Put(woptions, "a", "")); + ASSERT_OK(db->Put(woptions, "b", "")); - ASSERT_OK(txn->Get(roptions, Slice("b"), &value)); - ASSERT_EQ(value, "v4"); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); + txn->SetSnapshot(); - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + for (int i = 0; i < 5; i++) { + std::string stored_value = "v" + ToString(i); + ASSERT_OK(txn->Put("a", stored_value)); + ASSERT_OK(txn->Put("b", stored_value)); + wup_txn->FlushWriteBatchToDB(false); - ASSERT_OK(txn->Get(roptions, Slice("a"), &value)); - ASSERT_EQ(value, "v7"); + // Test Get() + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, stored_value); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, stored_value); - ASSERT_OK(txn->Get(roptions, Slice("b"), &value)); - ASSERT_EQ(value, "v8"); + // Test Next() + auto iter = txn->GetIterator(roptions); + iter->Seek("a"); + verify_state(iter, "a", stored_value); - wup_txn->unprep_seqs_.clear(); + iter->Next(); + verify_state(iter, "b", stored_value); - // Test Next(). - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + iter->SeekToFirst(); + verify_state(iter, "a", stored_value); - iter->Seek("a"); - verify_state(iter, "a", "v3"); + iter->Next(); + verify_state(iter, "b", stored_value); - iter->Next(); - verify_state(iter, "b", "v4"); + delete iter; - iter->SeekToFirst(); - verify_state(iter, "a", "v3"); + // Test Prev() + iter = txn->GetIterator(roptions); + iter->SeekForPrev("b"); + verify_state(iter, "b", stored_value); - iter->Next(); - verify_state(iter, "b", "v4"); + iter->Prev(); + verify_state(iter, "a", stored_value); - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + iter->SeekToLast(); + verify_state(iter, "b", stored_value); - iter->Seek("a"); - verify_state(iter, "a", "v7"); + iter->Prev(); + verify_state(iter, "a", stored_value); - iter->Next(); - verify_state(iter, "b", "v8"); - - iter->SeekToFirst(); - verify_state(iter, "a", "v7"); - - iter->Next(); - verify_state(iter, "b", "v8"); - - wup_txn->unprep_seqs_.clear(); - - // Test Prev(). For Prev(), we need to adjust the snapshot to match what is - // possible in WriteUnpreparedTxn. - // - // Because of row locks and ValidateSnapshot, there cannot be any committed - // entries after snapshot, but before the first prepared key. 
- roptions.snapshot = snapshot2; - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + delete iter; + } - iter->SeekForPrev("b"); - verify_state(iter, "b", "v4"); + delete txn; + } +} - iter->Prev(); - verify_state(iter, "a", "v3"); +TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) { + // This is a stress test where different threads are writing random keys, and + // then before committing or aborting the transaction, it validates to see + // that it can read the keys it wrote, and the keys it did not write respect + // the snapshot. To avoid row lock contention (and simply stressing the + // locking system), each thread is mostly only writing to its own set of keys. + const uint32_t kNumIter = 1000; + const uint32_t kNumThreads = 10; + const uint32_t kNumKeys = 5; + + std::default_random_engine rand(static_cast( + std::hash()(std::this_thread::get_id()))); + + enum Action { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT }; + // Test with + // 1. no snapshots set + // 2. snapshot set on ReadOptions + // 3. snapshot set, and refreshing after every write. + for (Action a : {NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT}) { + WriteOptions write_options; + txn_db_options.transaction_lock_timeout = -1; + options.disable_auto_compactions = true; + ReOpen(); + + std::vector keys; + for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) { + keys.push_back("k" + ToString(k)); + } + std::shuffle(keys.begin(), keys.end(), rand); + + // This counter will act as a "sequence number" to help us validate + // visibility logic with snapshots. If we had direct access to the seqno of + // snapshots and key/values, then we should directly compare those instead. + std::atomic counter(0); + + std::function stress_thread = [&](int id) { + size_t tid = std::hash()(std::this_thread::get_id()); + Random64 rnd(static_cast(tid)); + + Transaction* txn; + TransactionOptions txn_options; + // batch_size of 1 causes writes to DB for every marker. + txn_options.max_write_batch_size = 1; + ReadOptions read_options; + + for (uint32_t i = 0; i < kNumIter; i++) { + std::set owned_keys(&keys[id * kNumKeys], + &keys[(id + 1) * kNumKeys]); + // Add unowned keys to make the workload more interesting, but this + // increases row lock contention, so just do it sometimes. + if (rnd.OneIn(2)) { + owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]); + } - iter->SeekToLast(); - verify_state(iter, "b", "v4"); + txn = db->BeginTransaction(write_options, txn_options); + txn->SetName(ToString(id)); + txn->SetSnapshot(); + if (a >= RO_SNAPSHOT) { + read_options.snapshot = txn->GetSnapshot(); + ASSERT_TRUE(read_options.snapshot != nullptr); + } - iter->Prev(); - verify_state(iter, "a", "v3"); + uint64_t buf[2]; + buf[0] = id; - roptions.snapshot = snapshot6; - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + // When scanning through the database, make sure that all unprepared + // keys have value >= snapshot and all other keys have value < snapshot. 
+ int64_t snapshot_num = counter.fetch_add(1); - iter->SeekForPrev("b"); - verify_state(iter, "b", "v8"); + Status s; + for (const auto& key : owned_keys) { + buf[1] = counter.fetch_add(1); + s = txn->Put(key, Slice((const char*)buf, sizeof(buf))); + if (!s.ok()) { + break; + } + if (a == REFRESH_SNAPSHOT) { + txn->SetSnapshot(); + read_options.snapshot = txn->GetSnapshot(); + snapshot_num = counter.fetch_add(1); + } + } - iter->Prev(); - verify_state(iter, "a", "v7"); + // Failure is possible due to snapshot validation. In this case, + // rollback and move onto next iteration. + if (!s.ok()) { + ASSERT_TRUE(s.IsBusy()); + ASSERT_OK(txn->Rollback()); + delete txn; + continue; + } - iter->SeekToLast(); - verify_state(iter, "b", "v8"); + auto verify_key = [&owned_keys, &a, &id, &snapshot_num]( + const std::string& key, + const std::string& value) { + if (owned_keys.count(key) > 0) { + ASSERT_EQ(value.size(), 16); + + // Since this key is part of owned_keys, then this key must be + // unprepared by this transaction identified by 'id' + ASSERT_EQ(((int64_t*)value.c_str())[0], id); + if (a == REFRESH_SNAPSHOT) { + // If refresh snapshot is true, then the snapshot is refreshed + // after every Put(), meaning that the current snapshot in + // snapshot_num must be greater than the "seqno" of any keys + // written by the current transaction. + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } else { + // If refresh snapshot is not on, then the snapshot was taken at + // the beginning of the transaction, meaning all writes must come + // after snapshot_num + ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num); + } + } else if (a >= RO_SNAPSHOT) { + // If this is not an unprepared key, just assert that the key + // "seqno" is smaller than the snapshot seqno. + ASSERT_EQ(value.size(), 16); + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } + }; + + // Validate Get()/Next()/Prev(). Do only one of them to save time, and + // reduce lock contention. + switch (rnd.Uniform(3)) { + case 0: // Validate Get() + { + for (const auto& key : keys) { + std::string value; + s = txn->Get(read_options, Slice(key), &value); + if (!s.ok()) { + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(owned_keys.count(key), 0); + } else { + verify_key(key, value); + } + } + break; + } + case 1: // Validate Next() + { + Iterator* iter = txn->GetIterator(read_options); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + delete iter; + break; + } + case 2: // Validate Prev() + { + Iterator* iter = txn->GetIterator(read_options); + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + delete iter; + break; + } + default: + ASSERT_TRUE(false); + } - iter->Prev(); - verify_state(iter, "a", "v7"); + if (rnd.OneIn(2)) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + delete txn; + } + }; - // Since the unprep_seqs_ data were faked for testing, we do not want the - // destructor for the transaction to be rolling back data that did not - // exist. 
- wup_txn->unprep_seqs_.clear(); + std::vector threads; + for (uint32_t i = 0; i < kNumThreads; i++) { + threads.emplace_back(stress_thread, i); + } - db->ReleaseSnapshot(snapshot0); - db->ReleaseSnapshot(snapshot2); - db->ReleaseSnapshot(snapshot4); - db->ReleaseSnapshot(snapshot6); - db->ReleaseSnapshot(snapshot8); - delete iter; - delete txn; + for (auto& t : threads) { + t.join(); + } + } } // This tests how write unprepared behaves during recovery when the DB crashes diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index d127220e47d..4d1401b3aa1 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -32,7 +32,7 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( WriteUnpreparedTxn* txn) { - auto unprep_seqs = txn->GetUnpreparedSequenceNumbers(); + const auto& unprep_seqs = txn->GetUnpreparedSequenceNumbers(); if (unprep_seqs.size()) { return unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; } @@ -44,7 +44,8 @@ WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const TransactionOptions& txn_options) : WritePreparedTxn(txn_db, write_options, txn_options), wupt_db_(txn_db), - recovered_txn_(false) { + recovered_txn_(false), + largest_validated_seq_(0) { max_write_batch_size_ = txn_options.max_write_batch_size; // We set max bytes to zero so that we don't get a memory limit error. // Instead of trying to keep write batch strictly under the size limit, we @@ -85,75 +86,82 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { write_batch_.SetMaxBytes(0); unprep_seqs_.clear(); recovered_txn_ = false; + largest_validated_seq_ = 0; } -Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value, - const bool assume_tracked) { +Status WriteUnpreparedTxn::HandleWrite(std::function do_write) { Status s = MaybeFlushWriteBatchToDB(); if (!s.ok()) { return s; } - return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + s = do_write(); + if (s.ok()) { + if (snapshot_) { + largest_validated_seq_ = + std::max(largest_validated_seq_, snapshot_->GetSequenceNumber()); + } else { + largest_validated_seq_ = kMaxSequenceNumber; + } + } + return s; +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); } Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value, const bool assume_tracked) { - Status s = MaybeFlushWriteBatchToDB(); - if (!s.ok()) { - return s; - } - return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); } Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, const bool assume_tracked) { - Status s = MaybeFlushWriteBatchToDB(); - if (!s.ok()) { - return s; - } - return TransactionBaseImpl::Merge(column_family, key, value, assume_tracked); + return HandleWrite([&]() { + return TransactionBaseImpl::Merge(column_family, key, value, + assume_tracked); + }); } Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* 
column_family, const Slice& key, const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
                                   const SliceParts& key,
                                   const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
                                         const Slice& key,
                                         const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
                                         const SliceParts& key,
                                         const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
 }

 // WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index 15a76d13437..b64fd81e611 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -17,6 +17,40 @@ namespace rocksdb {
 class WriteUnpreparedTxnDB;
 class WriteUnpreparedTxn;

+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key should
+// be visible or not, so we have to remember to check the DB for any uncommitted
+// keys that should be visible to us. First, we will need to change the seek to
+// snapshot logic, to seek to max_visible_seq = max(snap_seq, max_unprep_seq).
+// Any key greater than max_visible_seq should not be visible, because it
+// cannot be unprepared by the current transaction and it is not in its
+// snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not a
+// problem for MySQL though, because it avoids modifying the index as it is
+// scanning through it to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+// +// In DBIter, there is a "reseek" optimization if the iterator skips over too +// many keys. However, this assumes that the reseek seeks exactly to the +// required key. In write unprepared, even after seeking directly to +// max_visible_seq, some iteration may be required before hitting a visible key, +// and special precautions must be taken to avoid performing another reseek, +// leading to an infinite loop. +// class WriteUnpreparedTxnReadCallback : public ReadCallback { public: WriteUnpreparedTxnReadCallback(WritePreparedTxnDB* db, @@ -25,7 +59,7 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WriteUnpreparedTxn* txn) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out own writes. We - // will do the exact comparison agaisnt snapshots in IsVisibleFullCheck + // will do the exact comparison against snapshots in IsVisibleFullCheck // override. : ReadCallback(CalcMaxVisibleSeq(txn, snapshot), min_uncommitted), db_(db), @@ -34,12 +68,6 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { virtual bool IsVisibleFullCheck(SequenceNumber seq) override; - bool CanReseekToSkip() override { - return wup_snapshot_ == max_visible_seq_; - // Otherwise our own writes uncommitted are in db, and the assumptions - // behind reseek optimizations are no longer valid. - } - void Refresh(SequenceNumber seq) override { max_visible_seq_ = std::max(max_visible_seq_, seq); wup_snapshot_ = seq; @@ -130,6 +158,7 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status MaybeFlushWriteBatchToDB(); Status FlushWriteBatchToDB(bool prepared); + Status HandleWrite(std::function do_write); // For write unprepared, we check on every writebatch append to see if // max_write_batch_size_ has been exceeded, and then call @@ -153,6 +182,20 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // locked for efficiency reasons. For recovered transactions, skip unlocking // keys when transaction ends. bool recovered_txn_; + + // Track the largest sequence number at which we performed snapshot + // validation. If snapshot validation was skipped because no snapshot was set, + // then this is set to kMaxSequenceNumber. This value is useful because it + // means that for keys that have unprepared seqnos, we can guarantee that no + // committed keys by other transactions can exist between + // largest_validated_seq_ and max_unprep_seq. See + // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is + // necessary for iterator Prev(). + // + // Currently this value only increases during the lifetime of a transaction, + // but in some cases, we should be able to restore the previously largest + // value when calling RollbackToSavepoint. 
+ SequenceNumber largest_validated_seq_; }; } // namespace rocksdb diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index c4be058bb96..c3fcd1f45d2 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -368,25 +368,77 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, constexpr bool ALLOW_BLOB = true; constexpr bool ALLOW_REFRESH = true; std::shared_ptr own_snapshot = nullptr; - SequenceNumber snapshot_seq; + SequenceNumber snapshot_seq = kMaxSequenceNumber; SequenceNumber min_uncommitted = 0; - if (options.snapshot != nullptr) { - snapshot_seq = options.snapshot->GetSequenceNumber(); - min_uncommitted = - static_cast_with_check( - options.snapshot) - ->min_uncommitted_; - } else { - auto* snapshot = GetSnapshot(); - // We take a snapshot to make sure that the related data in the commit map - // are not deleted. - snapshot_seq = snapshot->GetSequenceNumber(); - min_uncommitted = - static_cast_with_check(snapshot) - ->min_uncommitted_; + + // Currently, the Prev() iterator logic does not work well without snapshot + // validation. The logic simply iterates through values of a key in + // ascending seqno order, stopping at the first non-visible value and + // returning the last visible value. + // + // For example, if snapshot sequence is 3, and we have the following keys: + // foo: v1 1 + // foo: v2 2 + // foo: v3 3 + // foo: v4 4 + // foo: v5 5 + // + // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3, + // which is the last visible key. + // + // For unprepared transactions, if we have snap_seq = 3, but the current + // transaction has unprep_seq 5, then returning the first non-visible key + // would be incorrect, as we should return v5, and not v3. The problem is that + // there are committed keys at snapshot_seq < commit_seq < unprep_seq. + // + // Snapshot validation can prevent this problem by ensuring that no committed + // keys exist at snapshot_seq < commit_seq, and thus any value with a sequence + // number greater than snapshot_seq must be unprepared keys. For example, if + // the transaction had a snapshot at 3, then snapshot validation would be + // performed during the Put(v5) call. It would find v4, and the Put would fail + // with snapshot validation failure. + // + // Because of this, if any writes have occurred, then the transaction snapshot + // must be used for the iterator. If no writes have occurred though, we can + // simply create a snapshot. Later writes would not be visible though, but we + // don't support iterating while writing anyway. + // + // TODO(lth): Improve Prev() logic to continue iterating until + // max_visible_seq, and then return the last visible key, so that this + // restriction can be lifted. + const Snapshot* snapshot = nullptr; + if (options.snapshot == nullptr) { + snapshot = GetSnapshot(); own_snapshot = std::make_shared(db_impl_, snapshot); + } else { + snapshot = options.snapshot; } + + snapshot_seq = snapshot->GetSequenceNumber(); assert(snapshot_seq != kMaxSequenceNumber); + // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are + // guaranteed that for keys that were modified by this transaction (and thus + // might have unprepared versions), no committed versions exist at + // largest_validated_seq < commit_seq (or the contrapositive: any committed + // version must exist at commit_seq <= largest_validated_seq). 
This implies
+  // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <=
+  // snapshot_seq. As explained above, the problem with Prev() only happens when
+  // snapshot_seq < commit_seq.
+  //
+  // For keys that were not modified by this transaction, largest_validated_seq_
+  // is meaningless, and Prev() should just work with the existing visibility
+  // logic.
+  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+      !txn->unprep_seqs_.empty()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "WriteUnprepared iterator creation failed since the "
+                    "transaction has performed unvalidated writes");
+    return nullptr;
+  }
+  min_uncommitted =
+      static_cast_with_check<const SnapshotImpl, const Snapshot>(snapshot)
+          ->min_uncommitted_;
+
   auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
   auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);

From 112702ac6cd14dfb2f0fbf929216deabcf2ccafc Mon Sep 17 00:00:00 2001
From: anand76
Date: Tue, 23 Jul 2019 11:12:25 -0700
Subject: [PATCH 245/572] Parallelize file_reader_writer_test in order to reduce timeouts

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5608

Test Plan:
make check
buck test mode/dev-tsan internal_repo_rocksdb/repo:file_reader_writer_test -- --run-disabled

Differential Revision: D16441796

Pulled By: anand1976

fbshipit-source-id: afbb88a9fcb1c0ba22215118767e8eab3d1d6a4a
---
 Makefile | 2 +-
 TARGETS | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index f8a904bd39d..65d884fa4fb 100644
--- a/Makefile
+++ b/Makefile
@@ -480,7 +480,6 @@ TESTS = \
 	fault_injection_test \
 	filelock_test \
 	filename_test \
-	file_reader_writer_test \
 	block_based_filter_block_test \
 	full_filter_block_test \
 	partitioned_filter_block_test \
@@ -580,6 +579,7 @@ PARALLEL_TEST = \
 	external_sst_file_test \
 	import_column_family_test \
 	fault_injection_test \
+	file_reader_writer_test \
 	inlineskiplist_test \
 	manual_compaction_test \
 	persistent_cache_test \
diff --git a/TARGETS b/TARGETS
index cfd9ef73d40..9246af36361 100644
--- a/TARGETS
+++ b/TARGETS
@@ -753,7 +753,7 @@ ROCKS_TESTS = [
     [
         "file_reader_writer_test",
         "util/file_reader_writer_test.cc",
-        "serial",
+        "parallel",
     ],
     [
         "filelock_test",

From 3782accf7de5830fef1fc88d69bbe2d9259b023f Mon Sep 17 00:00:00 2001
From: sdong
Date: Tue, 23 Jul 2019 13:56:52 -0700
Subject: [PATCH 246/572] ldb sometimes specifies a string-append merge operator (#5607)

Summary:
Right now, ldb cannot scan a DB containing merge operands using the default ldb options. There is no harm in providing a general merge operator so that ldb can at least print out something.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5607

Test Plan: Run ldb against a DB with merge operands and see the outputs.

Differential Revision: D16442634

fbshipit-source-id: c66c414ec07f219cfc6e6ec2cc14c783ee95df54
---
 HISTORY.md | 1 +
 tools/ldb_cmd.cc | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 19f4ce1297c..04f194e9258 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path
 * Overload GetAllKeyVersions() to support non-default column family.
 * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family.
https://github.com/facebook/rocksdb/issues/3469
+* ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator.

 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 22b2399a278..338f09fb992 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -31,6 +31,7 @@
 #include "util/coding.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
+#include "utilities/merge_operators.h"
 #include "utilities/ttl/db_ttl_impl.h"

 #include
@@ -353,11 +354,24 @@ void LDBCommand::OpenDB() {
           stderr,
           "wal_dir loaded from the option file doesn't exist. Ignore it.\n");
     }
+
+    // If merge operator is not set, set a string append operator. There is
+    // no harm doing it.
+    for (auto& cf_entry : column_families_) {
+      if (!cf_entry.options.merge_operator) {
+        cf_entry.options.merge_operator =
+            MergeOperators::CreateStringAppendOperator(':');
+      }
+    }
   }
   options_ = PrepareOptionsForOpenDB();
   if (!exec_state_.IsNotStarted()) {
     return;
   }
+  if (column_families_.empty() && !options_.merge_operator) {
+    // No harm to add a general merge operator if it is not specified.
+    options_.merge_operator = MergeOperators::CreateStringAppendOperator(':');
+  }
   // Open the DB.
   Status st;
   std::vector<ColumnFamilyHandle*> handles_opened;

From 6b7fcc0d5f8c91f891f243906e6431969cfa8d11 Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Tue, 23 Jul 2019 15:30:59 -0700
Subject: [PATCH 247/572] Improve CPU Efficiency of ApproximateSize (part 1) (#5613)

Summary:
1. Avoid creating an iterator just to call BlockBasedTable::ApproximateOffsetOf(). Instead, call into it directly.
2. Optimize BlockBasedTable::ApproximateOffsetOf() so that it keeps the index block iterator on the stack.
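[Editor's note] A minimal sketch of the stack-allocation pattern from point 2 above, using simplified stand-in types rather than the real BlockBasedTable API. The point is the ownership rule: the callee may either reuse the caller's stack iterator or heap-allocate one, and the caller frees the iterator only if it did not come from the stack (the block_based_table_reader.cc hunk below shows the real version).

#include <cstdint>

// Simplified stand-in for an index block iterator.
struct IndexIter {
  uint64_t offset = 0;
  void Seek(uint64_t /*key*/) { /* position the iterator; no-op here */ }
};

// Stand-in for NewIndexIterator(): reuses the caller-provided iterator when
// possible, otherwise heap-allocates (e.g. for more complex index types).
IndexIter* NewIndexIterator(IndexIter* input_iter, bool can_reuse) {
  return can_reuse ? input_iter : new IndexIter();
}

uint64_t ApproximateOffsetOfSketch(uint64_t key, bool can_reuse) {
  IndexIter iiter_on_stack;  // avoids a heap allocation on the common path
  IndexIter* iiter = NewIndexIterator(&iiter_on_stack, can_reuse);
  iiter->Seek(key);
  const uint64_t result = iiter->offset;
  // Free the iterator only if NewIndexIterator() could not use the stack one.
  if (iiter != &iiter_on_stack) {
    delete iiter;
  }
  return result;
}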
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5613 Differential Revision: D16442660 Pulled By: elipoz fbshipit-source-id: 9320be3e918c139b10e758cbbb684706d172e516 --- db/table_cache.cc | 28 ++++++++++++++++++- db/table_cache.h | 6 ++++ db/version_set.cc | 19 ++++--------- db/version_set.h | 4 +-- table/block_based/block_based_table_reader.cc | 12 ++++++-- table/block_based/block_based_table_reader.h | 7 +++-- 6 files changed, 55 insertions(+), 21 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index 2290b5939c5..48415beff34 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -194,7 +194,7 @@ InternalIterator* TableCache::NewIterator( if (table_reader == nullptr) { s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, + !for_compaction /* record_read_stats */, file_read_hist, skip_filters, level); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); @@ -505,4 +505,30 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } +uint64_t TableCache::ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateOffsetOf(key, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index f9fd4815228..89a0b1b5c63 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -153,6 +153,12 @@ class TableCache { const FileDescriptor& fd, const SliceTransform* prefix_extractor = nullptr); + // Returns approximated offset of a key in a file represented by fd. + uint64_t ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); diff --git a/db/version_set.cc b/db/version_set.cc index 559a4190f16..281065d0502 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4974,19 +4974,12 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, } else { // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. 
- TableReader* table_reader_ptr; - InternalIterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), - *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr, - /*file_read_hist=*/nullptr, caller, - /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, - /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); - if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key, caller); - } - delete iter; + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(), + v->GetMutableCFOptions().prefix_extractor.get()); + } } return result; } diff --git a/db/version_set.h b/db/version_set.h index 6b7c42881c1..ee94f5966df 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -655,7 +655,7 @@ class Version { uint64_t GetSstFilesSize(); - MutableCFOptions GetMutableCFOptions() { return mutable_cf_options_; } + const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } private: Env* env_; @@ -981,7 +981,7 @@ class VersionSet { void AddLiveFiles(std::vector* live_list); // Return the approximate size of data to be scanned for range [start, end) - // in levels [start_level, end_level). If end_level == 0 it will search + // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, int end_level, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index fde11c0d362..000bc295fc1 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -4018,10 +4018,11 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { BlockCacheLookupContext context(caller); - std::unique_ptr> index_iter( + IndexBlockIter iiter_on_stack; + auto index_iter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, - /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/&context)); + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); index_iter->Seek(key); uint64_t result; @@ -4041,6 +4042,11 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, result = rep_->footer.metaindex_handle().offset(); } } + + if (index_iter != &iiter_on_stack) { + delete index_iter; + } + return result; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 189cd5d2e3a..3a16e2995fb 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -318,8 +318,11 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context) const; // Get the iterator from the index reader. - // If input_iter is not set, return new Iterator - // If input_iter is set, update it and return it as Iterator + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. 
// // Note: ErrorIterator with Status::Incomplete shall be returned if all the // following conditions are met:
From 092f41703798011db3cc118d1b32c8ca5ddf9749 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 23 Jul 2019 15:57:43 -0700 Subject: [PATCH 248/572] Move the uncompression dictionary object out of the block cache (#5584)
Summary: RocksDB has historically stored uncompression dictionary objects in the block cache as opposed to storing just the block contents. This necessitated evicting the object upon table close. With the new code, only the raw blocks are stored in the cache, eliminating the need for eviction. In addition, the patch makes the following improvements: 1) Compression dictionary blocks are now prefetched/pinned similarly to index/filter blocks. 2) A copy operation was eliminated when the uncompression dictionary is retrieved. 3) Errors related to retrieving the uncompression dictionary are propagated as opposed to silently ignored. Note: the patch temporarily breaks the compression dictionary eviction stats. They will be fixed in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5584 Test Plan: make asan_check Differential Revision: D16344151 Pulled By: ltamasi fbshipit-source-id: 2962b295f5b19628f9da88a3fcebbce5a5017a7b
--- CMakeLists.txt | 1 + HISTORY.md | 7 +- TARGETS | 1 + db/db_block_cache_test.cc | 67 +++-- db/version_set.cc | 6 - include/rocksdb/cache.h | 5 - src.mk | 1 + .../block_based_filter_block_test.cc | 7 +- table/block_based/block_based_table_reader.cc | 249 ++++-------------- table/block_based/block_based_table_reader.h | 16 +- table/block_based/full_filter_block_test.cc | 7 +- table/block_based/partitioned_filter_block.cc | 2 +- .../partitioned_filter_block_test.cc | 4 - .../block_based/uncompression_dict_reader.cc | 138 ++++++++++ table/block_based/uncompression_dict_reader.h | 64 +++++ table/table_reader.h | 2 - table/table_test.cc | 170 ------------ util/compression.h | 93 +++++-- 18 files changed, 391 insertions(+), 449 deletions(-) create mode 100644 table/block_based/uncompression_dict_reader.cc create mode 100644 table/block_based/uncompression_dict_reader.h
diff --git a/CMakeLists.txt b/CMakeLists.txt index b49a13572bb..0bd7311498f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -601,6 +601,7 @@ set(SOURCES table/block_based/full_filter_block.cc table/block_based/index_builder.cc table/block_based/partitioned_filter_block.cc + table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/bloom_block.cc table/cuckoo/cuckoo_table_builder.cc
diff --git a/HISTORY.md b/HISTORY.md index 04f194e9258..d452a68a30f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,9 +6,10 @@ ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. -* Index and filter blocks are now handled similarly to data blocks with regards to the block cache: instead of storing reader objects in the cache, only the blocks themselves are cached. In addition, index and filter blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any). +* Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached.
In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to the above refactoring, block cache eviction statistics for indexes and filters are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes, filters, and compression dictionaries are temporarily broken. We plan to reintroduce them in a later phase. +* Errors related to the retrieval of the compression dictionary are now propagated to the user. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. @@ -26,6 +27,7 @@ * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -35,6 +37,7 @@ * Log Writer will flush after finishing the whole record, rather than a fragment. * Lower MultiGet batching API latency by reading data blocks from disk in parallel * Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. 
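The "no longer copied" improvement noted above comes from the reworked `UncompressionDict` in `util/compression.h` near the end of this patch: the dictionary now holds a `Slice` into the cached block and inherits from `Cleanable`, so it can take over releasing the block rather than duplicating the bytes into a `std::string`. Below is a minimal standalone sketch of that borrow-and-transfer pattern; the `Slice`, `Cleanable`, and `Dict` types here are simplified stand-ins, not the actual RocksDB classes:

```
#include <cassert>
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins for rocksdb::Slice and rocksdb::Cleanable.
struct Slice {
  const char* data;
  size_t size;
};

class Cleanable {
 public:
  ~Cleanable() {
    for (auto& cleanup : cleanups_) {
      cleanup();  // release whatever this object borrowed
    }
  }
  void RegisterCleanup(std::function<void()> cleanup) {
    cleanups_.push_back(std::move(cleanup));
  }

 private:
  std::vector<std::function<void()>> cleanups_;
};

// A dictionary that is a zero-copy view of a cached block; ownership of the
// block travels with the dictionary via the registered cleanup.
struct Dict : public Cleanable {
  Slice raw;
};

int main() {
  auto* block = new std::string("dictionary bytes");  // stands in for a cached block
  Dict dict;
  dict.raw = Slice{block->data(), block->size()};   // a view, not a copy
  dict.RegisterCleanup([block] { delete block; });  // dict now owns the block
  assert(dict.raw.size == block->size());
  return 0;
}  // the block is freed here, when dict (a Cleanable) is destroyed
```

In the actual patch, the equivalent handoff happens in `UncompressionDictReader::GetOrReadUncompressionDictionary()` via `CachableEntry::TransferTo()`, as shown in the `uncompression_dict_reader.cc` hunk below.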
diff --git a/TARGETS b/TARGETS index 9246af36361..122da8b542f 100644 --- a/TARGETS +++ b/TARGETS @@ -198,6 +198,7 @@ cpp_library( "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", "table/block_based/partitioned_filter_block.cc", + "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", "table/bloom_block.cc", "table/cuckoo/cuckoo_table_builder.cc", diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 77f37da0d45..422fd83bc20 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -19,6 +19,9 @@ class DBBlockCacheTest : public DBTestBase { size_t hit_count_ = 0; size_t insert_count_ = 0; size_t failure_count_ = 0; + size_t compression_dict_miss_count_ = 0; + size_t compression_dict_hit_count_ = 0; + size_t compression_dict_insert_count_ = 0; size_t compressed_miss_count_ = 0; size_t compressed_hit_count_ = 0; size_t compressed_insert_count_ = 0; @@ -69,6 +72,15 @@ class DBBlockCacheTest : public DBTestBase { TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); } + void RecordCacheCountersForCompressionDict(const Options& options) { + compression_dict_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + compression_dict_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + compression_dict_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + } + void CheckCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, size_t expected_inserts, size_t expected_failures) { @@ -87,6 +99,28 @@ class DBBlockCacheTest : public DBTestBase { failure_count_ = new_failure_count; } + void CheckCacheCountersForCompressionDict( + const Options& options, size_t expected_compression_dict_misses, + size_t expected_compression_dict_hits, + size_t expected_compression_dict_inserts) { + size_t new_compression_dict_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + size_t new_compression_dict_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + size_t new_compression_dict_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses, + new_compression_dict_miss_count); + ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits, + new_compression_dict_hit_count); + ASSERT_EQ( + compression_dict_insert_count_ + expected_compression_dict_inserts, + new_compression_dict_insert_count); + compression_dict_miss_count_ = new_compression_dict_miss_count; + compression_dict_hit_count_ = new_compression_dict_hit_count; + compression_dict_insert_count_ = new_compression_dict_insert_count; + } + void CheckCompressedCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, @@ -671,6 +705,8 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { options.table_factory.reset(new BlockBasedTableFactory(table_options)); DestroyAndReopen(options); + RecordCacheCountersForCompressionDict(options); + for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { @@ -683,27 +719,26 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + // Compression dictionary blocks are preloaded. 
+ CheckCacheCountersForCompressionDict( + options, kNumFiles /* expected_compression_dict_misses */, + 0 /* expected_compression_dict_hits */, + kNumFiles /* expected_compression_dict_inserts */); + // Seek to a key in a file. It should cause the SST's dictionary meta-block // to be read. RecordCacheCounters(options); - ASSERT_EQ(0, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + RecordCacheCountersForCompressionDict(options); ReadOptions read_options; ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); - // Two blocks missed/added: dictionary and data block - // One block hit: index since it's prefetched - CheckCacheCounters(options, 2 /* expected_misses */, 1 /* expected_hits */, - 2 /* expected_inserts */, 0 /* expected_failures */); - ASSERT_EQ(1, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + // Two block hits: index and dictionary since they are prefetched + // One block missed/added: data block + CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */, + 1 /* expected_inserts */, 0 /* expected_failures */); + CheckCacheCountersForCompressionDict( + options, 0 /* expected_compression_dict_misses */, + 1 /* expected_compression_dict_hits */, + 0 /* expected_compression_dict_inserts */); } } diff --git a/db/version_set.cc b/db/version_set.cc index 281065d0502..7d477a6806b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3420,16 +3420,10 @@ VersionSet::VersionSet(const std::string& dbname, env_options_(storage_options), block_cache_tracer_(block_cache_tracer) {} -void CloseTables(void* ptr, size_t) { - TableReader* table_reader = reinterpret_cast(ptr); - table_reader->Close(); -} - VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet Cache* table_cache = column_family_set_->get_table_cache(); - table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 410c2cf827a..6bde575e0fc 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -250,11 +250,6 @@ class Cache { virtual std::string GetPrintableOptions() const { return ""; } - // Mark the last inserted object as being a raw data block. This will be used - // in tests. The default implementation does nothing. 
- virtual void TEST_mark_as_data_block(const Slice& /*key*/, - size_t /*charge*/) {} - MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } private: diff --git a/src.mk b/src.mk index 4d635173b89..0f04fc73916 100644 --- a/src.mk +++ b/src.mk @@ -121,6 +121,7 @@ LIB_SOURCES = \ table/block_based/full_filter_block.cc \ table/block_based/index_builder.cc \ table/block_based/partitioned_filter_block.cc \ + table/block_based/uncompression_dict_reader.cc \ table/block_fetcher.cc \ table/bloom_block.cc \ table/cuckoo/cuckoo_table_builder.cc \ diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index 70bbde96ac8..d223dec6e1f 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -45,10 +45,7 @@ class TestHashFilter : public FilterPolicy { class MockBlockBasedTable : public BlockBasedTable { public: explicit MockBlockBasedTable(Rep* rep) - : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { - // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} }; class FilterBlockTest : public testing::Test { @@ -64,7 +61,6 @@ class FilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(new TestHashFilter); constexpr bool skip_filters = false; @@ -271,7 +267,6 @@ class BlockBasedFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); constexpr bool skip_filters = false; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 000bc295fc1..314763ec3b4 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -63,7 +63,6 @@ extern const std::string kHashIndexPrefixesMetadataBlock; typedef BlockBasedTable::IndexReader IndexReader; BlockBasedTable::~BlockBasedTable() { - Close(); delete rep_; } @@ -148,8 +147,6 @@ void DeleteCachedEntry(const Slice& /*key*/, void* value) { delete entry; } -void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); - // Release the cached entry and decrement its ref count. 
void ForceReleaseCachedEntry(void* arg, void* h) { Cache* cache = reinterpret_cast(arg); @@ -1419,37 +1416,6 @@ Status BlockBasedTable::ReadRangeDelBlock( return s; } -Status BlockBasedTable::ReadCompressionDictBlock( - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) const { - assert(compression_dict_block != nullptr); - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_cont{new BlockContents()}; - PersistentCacheOptions cache_options; - ReadOptions read_options; - read_options.verify_checksums = true; - BlockFetcher compression_block_fetcher( - rep_->file.get(), prefetch_buffer, rep_->footer, read_options, - rep_->compression_dict_handle, compression_dict_cont.get(), - rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, - BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(), - cache_options); - s = compression_block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - ROCKS_LOG_WARN( - rep_->ioptions.info_log, - "Encountered error while reading data from compression dictionary " - "block %s", - s.ToString().c_str()); - } else { - *compression_dict_block = std::move(compression_dict_cont); - } - } - return s; -} - Status BlockBasedTable::PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, @@ -1555,23 +1521,16 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - // TODO(ajkr): also prefetch compression dictionary block - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. - if (!table_options.cache_index_and_filter_blocks) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr uncompression_dict_reader; + s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, + prefetch_all, pin_all, lookup_context, + &uncompression_dict_reader); if (!s.ok()) { return s; } - if (!rep_->compression_dict_handle.IsNull()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep_->uncompression_dict.reset(new UncompressionDict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); - } + rep_->uncompression_dict_reader = std::move(uncompression_dict_reader); } assert(s.ok()); @@ -1609,8 +1568,8 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { if (rep_->index_reader) { usage += rep_->index_reader->ApproximateMemoryUsage(); } - if (rep_->uncompression_dict) { - usage += rep_->uncompression_dict->ApproximateMemoryUsage(); + if (rep_->uncompression_dict_reader) { + usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage(); } return usage; } @@ -1757,9 +1716,6 @@ Status BlockBasedTable::GetDataBlockFromCache( Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, &cache_handle); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, @@ -1863,9 +1819,6 @@ Status BlockBasedTable::PutDataBlockToCache( s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, &cache_handle, priority); -#ifndef NDEBUG - 
block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, @@ -1914,86 +1867,6 @@ std::unique_ptr BlockBasedTable::CreateFilterBlockReader( } } -CachableEntry BlockBasedTable::GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const { - if (!rep_->table_options.cache_index_and_filter_blocks) { - // block cache is either disabled or not used for meta-blocks. In either - // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep_->uncompression_dict.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; - } - if (rep_->compression_dict_handle.IsNull()) { - return CachableEntry(); - } - char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key_buf); - auto cache_handle = - GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, - BlockType::kCompressionDictionary, get_context); - UncompressionDict* dict = nullptr; - bool is_cache_hit = false; - size_t usage = 0; - if (cache_handle != nullptr) { - dict = reinterpret_cast( - rep_->table_options.block_cache->Value(cache_handle)); - is_cache_hit = true; - usage = dict->ApproximateMemoryUsage(); - } else if (no_io) { - // Do not invoke any io. - } else { - std::unique_ptr compression_dict_block; - Status s = - ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - std::unique_ptr uncompression_dict( - new UncompressionDict(compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, - rep_->ioptions.statistics)); - usage = uncompression_dict->ApproximateMemoryUsage(); - s = rep_->table_options.block_cache->Insert( - cache_key, uncompression_dict.get(), usage, - &DeleteCachedUncompressionDictEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - - if (s.ok()) { - UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, - get_context, usage); - dict = uncompression_dict.release(); - } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - assert(dict == nullptr); - assert(cache_handle == nullptr); - } - } - } - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - lookup_context) { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id, - lookup_context->get_from_user_specified_snapshot, - /*referenced_key=*/""); - block_cache_tracer_->WriteBlockAccess(access_record, cache_key, - rep_->cf_name_for_tracing(), - lookup_context->referenced_key); - } - return {dict, cache_handle ? 
rep_->table_options.block_cache.get() : nullptr, - cache_handle, false /* own_value */}; -} - // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( @@ -2028,13 +1901,17 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); + UncompressionDict uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, @@ -2268,7 +2145,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { Statistics* statistics = rep_->ioptions.statistics; const bool maybe_compressed = - block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; @@ -2321,6 +2200,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( case BlockType::kFilter: trace_block_type = TraceType::kBlockTraceFilterBlock; break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; case BlockType::kRangeDeletion: trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; @@ -2568,7 +2450,9 @@ Status BlockBasedTable::RetrieveBlock( } const bool maybe_compressed = - block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; const bool do_uncompress = maybe_compressed; std::unique_ptr block; @@ -3504,12 +3388,17 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); - auto uncompression_dict_storage = GetUncompressionDict( - nullptr, no_io, sst_file_range.begin()->get_context, &lookup_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? 
UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); + + UncompressionDict uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + size_t total_len = 0; ReadOptions ro = read_options; ro.read_tier = kBlockCacheTier; @@ -3535,6 +3424,14 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, sst_file_range.SkipKey(miter); continue; } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + statuses.emplace_back(); results.emplace_back(); if (v.handle.offset() == offset) { @@ -4191,23 +4088,25 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output compression dictionary - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */, - &compression_dict_block); + if (rep_->uncompression_dict_reader) { + UncompressionDict uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); if (!s.ok()) { return s; } - assert(compression_dict_block != nullptr); - auto compression_dict = compression_dict_block->data; + + const Slice& raw_dict = uncompression_dict.GetRawDict(); out_file->Append( "Compression Dictionary:\n" "--------------------------------------\n"); out_file->Append(" size (bytes): "); - out_file->Append(rocksdb::ToString(compression_dict.size())); + out_file->Append(rocksdb::ToString(raw_dict.size())); out_file->Append("\n\n"); out_file->Append(" HEX "); - out_file->Append(compression_dict.ToString(true).c_str()); + out_file->Append(raw_dict.ToString(true).c_str()); out_file->Append("\n\n"); } @@ -4233,29 +4132,6 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { return s; } -void BlockBasedTable::Close() { - if (rep_->closed) { - return; - } - - // cleanup index, filter, and compression dictionary blocks - // to avoid accessing dangling pointers - if (!rep_->table_options.no_block_cache) { - if (!rep_->compression_dict_handle.IsNull()) { - // Get the compression dictionary block key - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); - - Cache* const cache = rep_->table_options.block_cache.get(); - cache->Erase(key); - } - } - - rep_->closed = true; -} - Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" @@ -4431,15 +4307,4 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, out_file->Append("\n ------\n"); } -namespace { - -void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { - UncompressionDict* dict = reinterpret_cast(value); - RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, - dict->ApproximateMemoryUsage()); - delete dict; -} - -} // anonymous namespace - } // namespace rocksdb diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3a16e2995fb..85346d75c72 100644 --- 
a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -29,6 +29,7 @@ #include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" +#include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" #include "table/get_context.h" #include "table/multiget_context.h" @@ -176,8 +177,6 @@ class BlockBasedTable : public TableReader { Status VerifyChecksum(TableReaderCaller caller) override; - void Close() override; - ~BlockBasedTable(); bool TEST_FilterBlockInCache() const; @@ -242,8 +241,11 @@ class BlockBasedTable : public TableReader { template friend class FilterBlockReaderCommon; + friend class PartitionIndexReader; + friend class UncompressionDictReader; + protected: Rep* rep_; explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) @@ -313,10 +315,6 @@ class BlockBasedTable : public TableReader { CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, char* scratch, const UncompressionDict& uncompression_dict) const; - CachableEntry GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; - // Get the iterator from the index reader. // // If input_iter is not set, return a new Iterator. @@ -416,9 +414,6 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); - Status ReadCompressionDictBlock( - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) const; Status PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, @@ -514,7 +509,7 @@ struct BlockBasedTable::Rep { std::unique_ptr index_reader; std::unique_ptr filter; - std::unique_ptr uncompression_dict; + std::unique_ptr uncompression_dict_reader; enum class FilterType { kNoFilter, @@ -566,7 +561,6 @@ struct BlockBasedTable::Rep { bool index_key_includes_seq = true; bool index_value_is_full = true; - bool closed = false; const bool immortal_table; SequenceNumber get_global_seqno(BlockType block_type) const { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index e8fcce07d75..b87db6def94 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -44,10 +44,7 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { class MockBlockBasedTable : public BlockBasedTable { public: explicit MockBlockBasedTable(Rep* rep) - : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { - // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} }; class TestFilterBitsReader : public FilterBitsReader { @@ -116,7 +113,6 @@ class PluginFullFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(new TestHashFilter); constexpr bool skip_filters = false; @@ -210,7 +206,6 @@ class FullFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); constexpr bool skip_filters = 
false; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index ae57e85dca6..158ed84abee 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -324,7 +324,7 @@ void PartitionedFilterBlockReader::CacheDependencies(bool pin) { prefetch_buffer.reset(new FilePrefetchBuffer()); s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); + static_cast(prefetch_len)); // After prefetch, read the partitions one by one ReadOptions read_options; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 5e9e467723c..aa667afedf0 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -27,7 +27,6 @@ class MockedBlockBasedTable : public BlockBasedTable { MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } @@ -67,9 +66,6 @@ class PartitionedFilterBlockTest env_options_(options_), icomp_(options_.comparator) { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close - // will access variable that are not - // initialized in our mocked version table_options_.format_version = GetParam(); table_options_.index_block_restart_interval = 3; } diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 00000000000..d74dbf6c497 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "table/block_based/uncompression_dict_reader.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/compression.h" + +namespace rocksdb { + +Status UncompressionDictReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(uncompression_dict_reader); + + CachableEntry uncompression_dict_block; + if (prefetch || !use_cache) { + const Status s = ReadUncompressionDictionaryBlock( + table, prefetch_buffer, ReadOptions(), nullptr /* get_context */, + lookup_context, &uncompression_dict_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + uncompression_dict_block.Reset(); + } + } + + uncompression_dict_reader->reset( + new UncompressionDictReader(table, std::move(uncompression_dict_block))); + + return Status::OK(); +} + +Status UncompressionDictReader::ReadUncompressionDictionaryBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) { + // TODO: add perf counter for compression dictionary read time + + assert(table); + assert(uncompression_dict_block); + assert(uncompression_dict_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + assert(!rep->compression_dict_handle.IsNull()); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->compression_dict_handle, + UncompressionDict::GetEmptyDict(), uncompression_dict_block, + BlockType::kCompressionDictionary, get_context, lookup_context); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } + + return s; +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionaryBlock( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) const { + assert(uncompression_dict_block); + + if (!uncompression_dict_block_.IsEmpty()) { + uncompression_dict_block->SetUnownedValue( + uncompression_dict_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadUncompressionDictionaryBlock(table_, prefetch_buffer, read_options, + get_context, lookup_context, + uncompression_dict_block); +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + UncompressionDict* uncompression_dict) const { + CachableEntry uncompression_dict_block; + const Status s = GetOrReadUncompressionDictionaryBlock( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict_block); + + if (!s.ok()) { + return s; + } + + assert(uncompression_dict); + assert(table_); + assert(table_->get_rep()); + + UncompressionDict dict(uncompression_dict_block.GetValue()->data, + table_->get_rep()->blocks_definitely_zstd_compressed); + *uncompression_dict = std::move(dict); + uncompression_dict_block.TransferTo(uncompression_dict); + + return Status::OK(); +} + +size_t 
UncompressionDictReader::ApproximateMemoryUsage() const { + assert(!uncompression_dict_block_.GetOwnValue() || + uncompression_dict_block_.GetValue() != nullptr); + size_t usage = uncompression_dict_block_.GetOwnValue() + ? uncompression_dict_block_.GetValue()->ApproximateMemoryUsage() + : 0; + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + return usage; +} + +} // namespace rocksdb diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h new file mode 100644 index 00000000000..808149e96b3 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/format.h" + +namespace rocksdb { + +class BlockBasedTable; +struct BlockCacheLookupContext; +class FilePrefetchBuffer; +class GetContext; +struct ReadOptions; +struct UncompressionDict; + +// Provides access to the uncompression dictionary regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class UncompressionDictReader { + public: + static Status Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader); + + Status GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + UncompressionDict* uncompression_dict) const; + + size_t ApproximateMemoryUsage() const; + + private: + UncompressionDictReader( + const BlockBasedTable* t, + CachableEntry&& uncompression_dict_block) + : table_(t), + uncompression_dict_block_(std::move(uncompression_dict_block)) { + assert(table_); + } + + static Status ReadUncompressionDictionaryBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block); + + Status GetOrReadUncompressionDictionaryBlock( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) const; + + const BlockBasedTable* table_; + CachableEntry uncompression_dict_block_; +}; + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index 72d11a7bd24..eb383c8fe8e 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -124,8 +124,6 @@ class TableReader { virtual Status VerifyChecksum(TableReaderCaller /*caller*/) { return Status::NotSupported("VerifyChecksum() not supported"); } - - virtual void Close() {} }; } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index bb034311668..6cd26bc732a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2889,176 +2889,6 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { } } -// A wrapper around LRICache that also keeps track of data blocks (in contrast -// with the objects) in the cache. 
The class is very simple and can be used only -// for trivial tests. -class MockCache : public LRUCache { - public: - MockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio) - : LRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio) {} - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { - // Replace the deleter with our own so that we keep track of data blocks - // erased from the cache - deleters_[key.ToString()] = deleter; - return ShardedCache::Insert(key, value, charge, &MockDeleter, handle, - priority); - } - // This is called by the application right after inserting a data block - void TEST_mark_as_data_block(const Slice& key, size_t charge) override { - marked_data_in_cache_[key.ToString()] = charge; - marked_size_ += charge; - } - using DeleterFunc = void (*)(const Slice& key, void* value); - static std::map deleters_; - static std::map marked_data_in_cache_; - static size_t marked_size_; - static void MockDeleter(const Slice& key, void* value) { - // If the item was marked for being data block, decrease its usage from the - // total data block usage of the cache - if (marked_data_in_cache_.find(key.ToString()) != - marked_data_in_cache_.end()) { - marked_size_ -= marked_data_in_cache_[key.ToString()]; - } - // Then call the origianl deleter - assert(deleters_.find(key.ToString()) != deleters_.end()); - auto deleter = deleters_[key.ToString()]; - deleter(key, value); - } -}; - -size_t MockCache::marked_size_ = 0; -std::map MockCache::deleters_; -std::map MockCache::marked_data_in_cache_; - -// Block cache can contain raw data blocks as well as general objects. If an -// object depends on the table to be live, it then must be destructed before the -// table is closed. This test makes sure that the only items remains in the -// cache after the table is closed are raw data blocks. -TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { - std::vector compression_types{kNoCompression}; - - // The following are the compression library versions supporting compression - // dictionaries. See the test case CacheCompressionDict in the - // DBBlockCacheTest suite. 
-#ifdef ZLIB - compression_types.push_back(kZlibCompression); -#endif // ZLIB -#if LZ4_VERSION_NUMBER >= 10400 - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 -#if ZSTD_VERSION_NUMBER >= 500 - compression_types.push_back(kZSTD); -#endif // ZSTD_VERSION_NUMBER >= 500 - - for (int level: {-1, 0, 1, 10}) { - for (auto index_type : - {BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { - for (bool block_based_filter : {true, false}) { - for (bool partition_filter : {true, false}) { - if (partition_filter && - (block_based_filter || - index_type != - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { - continue; - } - for (bool index_and_filter_in_cache : {true, false}) { - for (bool pin_l0 : {true, false}) { - for (bool pin_top_level : {true, false}) { - if (pin_l0 && !index_and_filter_in_cache) { - continue; - } - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : {0, 1 << 14}) { - if (compression_type == kNoCompression && max_dict_bytes) - continue; - - // Create a table - Options opt; - std::unique_ptr ikc; - ikc.reset(new test::PlainInternalKeyComparator( - opt.comparator)); - opt.compression = compression_type; - opt.compression_opts.max_dict_bytes = max_dict_bytes; - BlockBasedTableOptions table_options = - GetBlockBasedTableOptions(); - table_options.block_size = 1024; - table_options.index_type = index_type; - table_options.pin_l0_filter_and_index_blocks_in_cache = - pin_l0; - table_options.pin_top_level_index_and_filter = - pin_top_level; - table_options.partition_filters = partition_filter; - table_options.cache_index_and_filter_blocks = - index_and_filter_in_cache; - // big enough so we don't ever lose cached values. 
- table_options.block_cache = std::make_shared( - 16 * 1024 * 1024, 4, false, 0.0); - table_options.filter_policy.reset( - rocksdb::NewBloomFilterPolicy(10, block_based_filter)); - opt.table_factory.reset(NewBlockBasedTableFactory( - table_options)); - - bool convert_to_internal_key = false; - TableConstructor c(BytewiseComparator(), - convert_to_internal_key, level); - std::string user_key = "k01"; - std::string key = - InternalKey(user_key, 0, kTypeValue).Encode().ToString(); - c.Add(key, "hello"); - std::vector keys; - stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - const MutableCFOptions moptions(opt); - c.Finish(opt, ioptions, moptions, table_options, *ikc, - &keys, &kvmap); - - // Doing a read to make index/filter loaded into the cache - auto table_reader = - dynamic_cast(c.GetTableReader()); - PinnableSlice value; - GetContext get_context(opt.comparator, nullptr, nullptr, - nullptr, GetContext::kNotFound, user_key, &value, - nullptr, nullptr, nullptr, nullptr); - InternalKey ikey(user_key, 0, kTypeValue); - auto s = table_reader->Get(ReadOptions(), key, &get_context, - moptions.prefix_extractor.get()); - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); - - // Close the table - c.ResetTableReader(); - - auto usage = table_options.block_cache->GetUsage(); - auto pinned_usage = - table_options.block_cache->GetPinnedUsage(); - // The only usage must be for marked data blocks - ASSERT_EQ(usage, MockCache::marked_size_); - // There must be some pinned data since PinnableSlice has - // not released them yet - ASSERT_GT(pinned_usage, 0); - // Release pinnable slice reousrces - value.Reset(); - pinned_usage = table_options.block_cache->GetPinnedUsage(); - ASSERT_EQ(pinned_usage, 0); - } - } - } - } - } - } - } - } - } // level -} - TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already // in the cache. This test checks whether the Table actually makes use of the diff --git a/util/compression.h b/util/compression.h index aa8af74499b..5dbb6c244aa 100644 --- a/util/compression.h +++ b/util/compression.h @@ -21,6 +21,7 @@ #include #include "memory/memory_allocator.h" +#include "rocksdb/cleanable.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "util/coding.h" @@ -216,36 +217,60 @@ struct CompressionDict { // Holds dictionary and related data, like ZSTD's digested uncompression // dictionary. -struct UncompressionDict { +struct UncompressionDict : public Cleanable { + // Block containing the data for the compression dictionary. It is non-empty + // only if the constructor that takes a string parameter is used. + std::string dict_; + + // Slice pointing to the compression dictionary data. Points to + // dict_ if the string constructor is used. In the case of the Slice + // constructor, it is a copy of the Slice passed by the caller. + Slice slice_; + #ifdef ROCKSDB_ZSTD_DDICT - ZSTD_DDict* zstd_ddict_; + // Processed version of the contents of slice_ for ZSTD compression. + ZSTD_DDict* zstd_ddict_ = nullptr; #endif // ROCKSDB_ZSTD_DDICT - // Block containing the data for the compression dictionary. It may be - // redundant with the data held in `zstd_ddict_`. - std::string dict_; - // This `Statistics` pointer is intended to be used upon block cache eviction, - // so only needs to be populated on `UncompressionDict`s that'll be inserted - // into block cache. 
- Statistics* statistics_; + // Slice constructor: it is the caller's responsibility to either + // a) make sure slice remains valid throughout the lifecycle of this object OR + // b) transfer the management of the underlying resource (e.g. cache handle) + // to this object, in which case UncompressionDict is self-contained, and the + // resource is guaranteed to be released (via the cleanup logic in Cleanable) + // when UncompressionDict is destroyed. #ifdef ROCKSDB_ZSTD_DDICT - UncompressionDict(std::string dict, bool using_zstd, - Statistics* _statistics = nullptr) { + UncompressionDict(Slice slice, bool using_zstd) #else // ROCKSDB_ZSTD_DDICT - UncompressionDict(std::string dict, bool /*using_zstd*/, - Statistics* _statistics = nullptr) { + UncompressionDict(Slice slice, bool /*using_zstd*/) #endif // ROCKSDB_ZSTD_DDICT - dict_ = std::move(dict); - statistics_ = _statistics; + : slice_(std::move(slice)) { #ifdef ROCKSDB_ZSTD_DDICT - zstd_ddict_ = nullptr; - if (!dict_.empty() && using_zstd) { - zstd_ddict_ = ZSTD_createDDict_byReference(dict_.data(), dict_.size()); + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); assert(zstd_ddict_ != nullptr); } #endif // ROCKSDB_ZSTD_DDICT } + // String constructor: results in a self-contained UncompressionDict. + UncompressionDict(std::string dict, bool using_zstd) + : UncompressionDict(Slice(dict), using_zstd) { + dict_ = std::move(dict); + } + + UncompressionDict(UncompressionDict&& rhs) + : dict_(std::move(rhs.dict_)), + slice_(std::move(rhs.slice_)) +#ifdef ROCKSDB_ZSTD_DDICT + , + zstd_ddict_(rhs.zstd_ddict_) +#endif + { +#ifdef ROCKSDB_ZSTD_DDICT + rhs.zstd_ddict_ = nullptr; +#endif + } + ~UncompressionDict() { #ifdef ROCKSDB_ZSTD_DDICT size_t res = 0; @@ -257,20 +282,34 @@ struct UncompressionDict { #endif // ROCKSDB_ZSTD_DDICT } + UncompressionDict& operator=(UncompressionDict&& rhs) { + if (this == &rhs) { + return *this; + } + + dict_ = std::move(rhs.dict_); + slice_ = std::move(rhs.slice_); + +#ifdef ROCKSDB_ZSTD_DDICT + zstd_ddict_ = rhs.zstd_ddict_; + rhs.zstd_ddict_ = nullptr; +#endif + + return *this; + } + + const Slice& GetRawDict() const { return slice_; } + #ifdef ROCKSDB_ZSTD_DDICT const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; } #endif // ROCKSDB_ZSTD_DDICT - Slice GetRawDict() const { return dict_; } - static const UncompressionDict& GetEmptyDict() { static UncompressionDict empty_dict{}; return empty_dict; } - Statistics* statistics() const { return statistics_; } - - size_t ApproximateMemoryUsage() { + size_t ApproximateMemoryUsage() const { size_t usage = 0; usage += sizeof(struct UncompressionDict); #ifdef ROCKSDB_ZSTD_DDICT @@ -281,11 +320,9 @@ struct UncompressionDict { } UncompressionDict() = default; - // Disable copy/move + // Disable copy UncompressionDict(const CompressionDict&) = delete; UncompressionDict& operator=(const CompressionDict&) = delete; - UncompressionDict(CompressionDict&&) = delete; - UncompressionDict& operator=(CompressionDict&&) = delete; }; class CompressionContext { @@ -725,7 +762,7 @@ inline CacheAllocationPtr Zlib_Uncompress( return nullptr; } - Slice compression_dict = info.dict().GetRawDict(); + const Slice& compression_dict = info.dict().GetRawDict(); if (compression_dict.size()) { // Initialize the compression library's dictionary st = inflateSetDictionary( @@ -1040,7 +1077,7 @@ inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info, auto output = AllocateBlock(output_len, allocator); #if 
LZ4_VERSION_NUMBER >= 10400 // r124+ LZ4_streamDecode_t* stream = LZ4_createStreamDecode(); - Slice compression_dict = info.dict().GetRawDict(); + const Slice& compression_dict = info.dict().GetRawDict(); if (compression_dict.size()) { LZ4_setStreamDecode(stream, compression_dict.data(), static_cast(compression_dict.size())); From cfcf045accbc5d682a02f4acb1192a7f54f05f1f Mon Sep 17 00:00:00 2001 From: Mark Rambacher Date: Tue, 23 Jul 2019 17:08:26 -0700 Subject: [PATCH 249/572] =?UTF-8?q?The=20ObjectRegistry=20class=20replaces?= =?UTF-8?q?=20the=20Registrar=20and=20NewCustomObjects.=E2=80=A6=20(#5293)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The ObjectRegistry class replaces the Registrar and NewCustomObjects. Objects are registered with the registry by Type (the class must implement the static const char *Type() method). This change is necessary for a few reasons: - By having a class (rather than static template instances), the class can be passed between compilation units, meaning that objects could be registered and shared from a dynamic library with an executable. - By having a class with instances, different units could have different objects registered. This could be useful if, for example, one Option allowed for a dynamic library and one did not. When combined with some other PRs (being able to load shared libraries, a Configurable interface to configure objects to/from string), this code will allow objects in external shared libraries to be added to a RocksDB image at run-time, rather than requiring every new extension to be built into the main library and called explicitly by every program. Test plan (on riversand963's devserver) ``` $COMPILE_WITH_ASAN=1 make -j32 all && sleep 1 && make check ``` All tests pass. 
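To make the interface concrete, here is a hedged sketch of registering and loading a custom `Env` through the new classes, based on the `object_registry.h` interface included in this patch. The `memenv://` pattern and the `MemEnvFactory` name are illustrative, not anything this patch registers, and `NewMemEnv()` is assumed to be available from `rocksdb/env.h` in non-LITE builds:

```
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

// Matches FactoryFunc<Env>. A factory returning a statically owned Env leaves
// the guard empty, signalling that the caller does not take ownership.
static rocksdb::Env* MemEnvFactory(const std::string& /*uri*/,
                                   std::unique_ptr<rocksdb::Env>* /*guard*/,
                                   std::string* /*errmsg*/) {
  static rocksdb::Env* mem_env = rocksdb::NewMemEnv(rocksdb::Env::Default());
  return mem_env;
}

rocksdb::Env* LoadMemEnv() {
  // Entries are keyed by type (Env::Type()) and matched by regex pattern.
  rocksdb::ObjectLibrary::Default()->Register<rocksdb::Env>("memenv://.*",
                                                            MemEnvFactory);
  std::unique_ptr<rocksdb::Env> guard;
  std::string errmsg;
  rocksdb::Env* env =
      rocksdb::ObjectRegistry::NewInstance()->NewObject<rocksdb::Env>(
          "memenv://test", &guard, &errmsg);
  return env;  // nullptr, with errmsg populated, if no entry matched
}
```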
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5293 Differential Revision: D16363396 Pulled By: riversand963 fbshipit-source-id: fbe4acb615bfc11103eef40a0b288845791c0180
--- CMakeLists.txt | 3 +- HISTORY.md | 1 + TARGETS | 26 ++- env/env.cc | 15 ++ env/env_basic_test.cc | 4 +- include/rocksdb/comparator.h | 1 + include/rocksdb/env.h | 5 + include/rocksdb/merge_operator.h | 1 + include/rocksdb/statistics.h | 2 +- include/rocksdb/utilities/object_registry.h | 225 +++++++++++++++----- options/options_helper.cc | 26 +-- options/options_test.cc | 33 +-- src.mk | 1 + tools/block_cache_trace_analyzer.cc | 2 +- tools/db_bench_tool.cc | 17 +- tools/ldb_cmd.cc | 11 +- utilities/object_registry.cc | 87 ++++++++ utilities/object_registry_test.cc | 137 ++++++++++-- 18 files changed, 465 insertions(+), 132 deletions(-) create mode 100644 utilities/object_registry.cc
diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bd7311498f..086975f3e8f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -520,7 +520,7 @@ set(SOURCES db/flush_job.cc db/flush_scheduler.cc db/forward_iterator.cc - db/import_column_family_job.cc + db/import_column_family_job.cc db/internal_stats.cc db/logs_with_prep_tracker.cc db/log_reader.cc @@ -681,6 +681,7 @@ set(SOURCES utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc + utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc utilities/options/options_util.cc utilities/persistent_cache/block_cache_tier.cc
diff --git a/HISTORY.md b/HISTORY.md index d452a68a30f..59205341020 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,7 @@ * Overload GetAllKeyVersions() to support non-default column family. * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. +* Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from a string; also adds LoadEnv() to Env. ### New Features * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
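The `LoadEnv()` mentioned in the change-log entry above is a thin wrapper over the registry (see the `env/env.cc` hunk below). A hedged usage sketch follows; the `custom://` URI is illustrative, and since the implementation shown below reads `*result` before overwriting it, the pointer is initialized up front:

```
#include <string>

#include "rocksdb/env.h"

rocksdb::Status PickEnv(const std::string& uri, rocksdb::Env** out) {
  rocksdb::Env* env = rocksdb::Env::Default();  // starting value, read by LoadEnv
  rocksdb::Status s = rocksdb::Env::LoadEnv(uri, &env);
  if (s.ok()) {
    *out = env;  // the Env whose registered pattern matched `uri`
  }
  return s;  // NotSupported in LITE builds; non-OK if no registered Env matched
}
```

For example, a tool could call `PickEnv("custom://test", &options.env)` to select an externally registered environment without linking it in explicitly.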
diff --git a/TARGETS b/TARGETS index 122da8b542f..ba6f96c0b5f 100644 --- a/TARGETS +++ b/TARGETS @@ -276,6 +276,7 @@ cpp_library( "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", "utilities/option_change_migration/option_change_migration.cc", "utilities/options/options_util.cc", "utilities/persistent_cache/block_cache_tier.cc", @@ -371,11 +372,6 @@ ROCKS_TESTS = [ "logging/auto_roll_logger_test.cc", "serial", ], - [ - "env_logger_test", - "logging/env_logger_test.cc", - "serial", - ], [ "autovector_test", "util/autovector_test.cc", @@ -422,13 +418,13 @@ ROCKS_TESTS = [ "serial", ], [ - "cache_test", - "cache/cache_test.cc", + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", "serial", ], [ - "cache_simulator_test", - "utilities/simulator_cache/cache_simulator_test.cc", + "cache_test", + "cache/cache_test.cc", "serial", ], [ @@ -554,7 +550,7 @@ ROCKS_TESTS = [ [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "parallel", + "serial", ], [ "db_compaction_filter_test", @@ -711,6 +707,11 @@ ROCKS_TESTS = [ "env/env_basic_test.cc", "serial", ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + ], [ "env_test", "env/env_test.cc", @@ -796,6 +797,11 @@ ROCKS_TESTS = [ "monitoring/histogram_test.cc", "serial", ], + [ + "import_column_family_test", + "db/import_column_family_test.cc", + "parallel", + ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", diff --git a/env/env.cc b/env/env.cc index 87b6b35c16c..4c222cfc19e 100644 --- a/env/env.cc +++ b/env/env.cc @@ -16,6 +16,7 @@ #include "port/port.h" #include "port/sys_time.h" #include "rocksdb/options.h" +#include "rocksdb/utilities/object_registry.h" #include "util/autovector.h" namespace rocksdb { @@ -28,6 +29,20 @@ Status Env::NewLogger(const std::string& fname, return NewEnvLogger(fname, this, result); } +Status Env::LoadEnv(const std::string& value, Env** result) { + Env* env = *result; + Status s; +#ifndef ROCKSDB_LITE + s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); +#else + s = Status::NotSupported("Cannot load environment in LITE mode: ", value); +#endif + if (s.ok()) { + *result = env; + } + return s; +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index f306edbd6ba..c955bdb7141 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -11,7 +11,6 @@ #include "env/mock_env.h" #include "rocksdb/env.h" -#include "rocksdb/utilities/object_registry.h" #include "test_util/testharness.h" namespace rocksdb { @@ -104,13 +103,12 @@ namespace { // ValuesIn() will skip running tests when given an empty collection. std::vector GetCustomEnvs() { static Env* custom_env; - static std::unique_ptr custom_env_guard; static bool init = false; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - custom_env = NewCustomObject(uri, &custom_env_guard); + Env::LoadEnv(uri, &custom_env); } } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 9f262367d11..e30a9d01459 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -35,6 +35,7 @@ class Comparator { virtual ~Comparator() {} + static const char* Type() { return "Comparator"; } // Three-way comparison. 
Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 126f25747ff..398a7ff511d 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -144,6 +144,11 @@ class Env { virtual ~Env(); + static const char* Type() { return "Environment"; } + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index d8ddcc6a097..36f47e254ed 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -46,6 +46,7 @@ class Logger; class MergeOperator { public: virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation. diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 653b460cbdd..a8d01e03415 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -480,7 +480,7 @@ enum StatsLevel : uint8_t { class Statistics { public: virtual ~Statistics() {} - + static const char* Type() { return "Statistics"; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 86a51b92ead..d1516079a61 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -11,80 +11,195 @@ #include #include #include +#include #include - -#include "rocksdb/env.h" +#include "rocksdb/status.h" namespace rocksdb { - -// Creates a new T using the factory function that was registered with a pattern -// that matches the provided "target" string according to std::regex_match. -// -// If no registered functions match, returns nullptr. If multiple functions -// match, the factory function used is unspecified. -// -// Populates res_guard with result pointer if caller is granted ownership. -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); - +class Logger; // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template -using FactoryFunc = std::function*)>; - -// To register a factory function for a type T, initialize a Registrar object -// with static storage duration. For example: -// -// static Registrar hdfs_reg("hdfs://.*", &CreateHdfsEnv); -// -// Then, calling NewCustomObject("hdfs://some_path", ...) will match the -// regex provided above, so it returns the result of invoking CreateHdfsEnv. -template -class Registrar { +using FactoryFunc = + std::function*, std::string*)>; + +class ObjectLibrary { public: - explicit Registrar(std::string pattern, FactoryFunc factory); -}; + // Base class for an Entry in the Registry. 
+ class Entry { + public: + virtual ~Entry() {} + Entry(const std::string& name) : name_(std::move(name)) {} + + // Checks to see if the target matches this entry + virtual bool matches(const std::string& target) const { + return name_ == target; + } + const std::string& Name() const { return name_; } + + private: + const std::string name_; // The name of the Entry + }; // End class Entry + + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(const std::string& name, FactoryFunc f) + : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} + ~FactoryEntry() override {} + bool matches(const std::string& target) const override { + return std::regex_match(target, pattern_); + } + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } -// Implementation details follow. + private: + std::regex pattern_; // The pattern for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + // Finds the entry matching the input name and type + const Entry* FindEntry(const std::string& type, + const std::string& name) const; + void Dump(Logger* logger) const; + + // Registers the factory with the library for the pattern. + // If the pattern matches, the factory may be used to create a new object. + template + const FactoryFunc& Register(const std::string& pattern, + const FactoryFunc& factory) { + std::unique_ptr entry(new FactoryEntry(pattern, factory)); + AddEntry(T::Type(), entry); + return factory; + } + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); -namespace internal { + private: + // Adds the input entry to the list for the given type + void AddEntry(const std::string& type, std::unique_ptr& entry); -template -struct RegistryEntry { - std::regex pattern; - FactoryFunc factory; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> entries_; }; -template -struct Registry { - static Registry* Get() { - static Registry instance; - return &instance; +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + + ObjectRegistry(); + + void AddLibrary(const std::shared_ptr& library) { + libraries_.emplace_back(library); } - std::vector> entries; - private: - Registry() = default; -}; + // Creates a new T using the factory function that was registered with a + // pattern that matches the provided "target" string according to + // std::regex_match. + // + // If no registered functions match, returns nullptr. If multiple functions + // match, the factory function used is unspecified. + // + // Populates res_guard with result pointer if caller is granted ownership. + template + T* NewObject(const std::string& target, std::unique_ptr* guard, + std::string* errmsg) { + guard->reset(); + const auto* basic = FindEntry(T::Type(), target); + if (basic != nullptr) { + const auto* factory = + static_cast*>(basic); + return factory->NewFactoryObject(target, guard, errmsg); + } else { + *errmsg = std::string("Could not load ") + T::Type(); + return nullptr; + } + } + + // Creates a new unique T using the input factory functions. 
+ // Returns OK if a new unique T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + std::string errmsg; + T* ptr = NewObject(target, result, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (*result) { + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } -} // namespace internal + // Creates a new shared T using the input factory functions. + // Returns OK if a new shared T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard) { - res_guard->reset(); - for (const auto& entry : internal::Registry::Get()->entries) { - if (std::regex_match(target, entry.pattern)) { - return entry.factory(target, res_guard); + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); } } - return nullptr; -} -template -Registrar::Registrar(std::string pattern, FactoryFunc factory) { - internal::Registry::Get()->entries.emplace_back(internal::RegistryEntry{ - std::regex(std::move(pattern)), std::move(factory)}); -} + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + private: + const ObjectLibrary::Entry* FindEntry(const std::string& type, + const std::string& name) const; + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. + std::vector> libraries_; +}; } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/options/options_helper.cc b/options/options_helper.cc index 922ece3a81a..5733ceed455 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1045,21 +1045,21 @@ Status ParseColumnFamilyOption(const std::string& name, } else { if (name == kNameComparator) { // Try to get comparator from object registry first. 
- std::unique_ptr comp_guard; - const Comparator* comp = - NewCustomObject(value, &comp_guard); // Only support static comparator for now. - if (comp != nullptr && !comp_guard) { - new_options->comparator = comp; + Status status = ObjectRegistry::NewInstance()->NewStaticObject( + value, &new_options->comparator); + if (status.ok()) { + return status; } } else if (name == kNameMergeOperator) { // Try to get merge operator from object registry first. - std::unique_ptr> mo_guard; - std::shared_ptr* mo = - NewCustomObject>(value, &mo_guard); + std::shared_ptr mo; + Status status = + ObjectRegistry::NewInstance()->NewSharedObject( + value, &new_options->merge_operator); // Only support static comparator for now. - if (mo != nullptr) { - new_options->merge_operator = *mo; + if (status.ok()) { + return status; } } @@ -1191,10 +1191,10 @@ Status ParseDBOption(const std::string& name, NewGenericRateLimiter(static_cast(ParseUint64(value)))); } else if (name == kNameEnv) { // Currently `Env` can be deserialized from object registry only. - std::unique_ptr env_guard; - Env* env = NewCustomObject(value, &env_guard); + Env* env = new_options->env; + Status status = Env::LoadEnv(value, &env); // Only support static env for now. - if (env != nullptr && !env_guard) { + if (status.ok()) { new_options->env = env; } } else { diff --git a/options/options_test.cc b/options/options_test.cc index 823a9c1e054..05ea766f6a6 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -341,11 +341,11 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // Comparator from object registry std::string kCompName = "reverse_comp"; - static Registrar test_reg_a( - kCompName, [](const std::string& /*name*/, - std::unique_ptr* /*comparator_guard*/) { - return ReverseBytewiseComparator(); - }); + ObjectLibrary::Default()->Register( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); @@ -354,13 +354,12 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - static Registrar> test_reg_b( - kMoName, [](const std::string& /*name*/, - std::unique_ptr>* - merge_operator_guard) { - merge_operator_guard->reset( - new std::shared_ptr(new BytesXOROperator())); - return merge_operator_guard->get(); + ObjectLibrary::Default()->Register( + kMoName, + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BytesXOROperator()); + return guard->get(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -770,9 +769,10 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} }; - static Registrar test_reg_env( + ObjectLibrary::Default()->Register( kCustomEnvName, - [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/) { + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { static CustomEnv env(Env::Default()); return &env; }); @@ -813,8 +813,9 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); - std::unique_ptr env_guard; - ASSERT_EQ(NewCustomObject(kCustomEnvName, &env_guard), new_options.env); + Env* newEnv 
= new_options.env; + ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_EQ(newEnv, new_options.env); } TEST_F(OptionsTest, DBOptionsSerialization) { diff --git a/src.mk b/src.mk index 0f04fc73916..3462a6a58bb 100644 --- a/src.mk +++ b/src.mk @@ -195,6 +195,7 @@ LIB_SOURCES = \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ utilities/merge_operators/bytesxor.cc \ + utilities/object_registry.cc \ utilities/option_change_migration/option_change_migration.cc \ utilities/options/options_util.cc \ utilities/persistent_cache/block_cache_tier.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 08143ebcf88..761395a6654 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -1637,7 +1637,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, } fprintf(stdout, "Bottom %" PRIu32 " access count. Access count=%" PRIu64 - " nblocks=%" PRIu64 " %s\n", + " nblocks=%" ROCKSDB_PRIszt " %s\n", bottom_k, naccess_it->first, naccess_it->second.size(), statistics.c_str()); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 39f9eebc7e0..f6a9d945897 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -3049,8 +3049,9 @@ class Benchmark { std::shared_ptr timestamp_emulator_; std::unique_ptr secondary_update_thread_; std::atomic secondary_update_stopped_{0}; +#ifndef ROCKSDB_LITE uint64_t secondary_db_updates_ = 0; - +#endif // ROCKSDB_LITE struct ThreadArg { Benchmark* bm; SharedState* shared; @@ -6366,13 +6367,12 @@ int db_bench_tool(int argc, char** argv) { exit(1); } if (!FLAGS_statistics_string.empty()) { - std::unique_ptr custom_stats_guard; - dbstats.reset(NewCustomObject(FLAGS_statistics_string, - &custom_stats_guard)); - custom_stats_guard.release(); + Status s = ObjectRegistry::NewInstance()->NewSharedObject( + FLAGS_statistics_string, &dbstats); if (dbstats == nullptr) { - fprintf(stderr, "No Statistics registered matching string: %s\n", - FLAGS_statistics_string.c_str()); + fprintf(stderr, + "No Statistics registered matching string: %s status=%s\n", + FLAGS_statistics_string.c_str(), s.ToString().c_str()); exit(1); } } @@ -6400,12 +6400,11 @@ int db_bench_tool(int argc, char** argv) { StringToCompressionType(FLAGS_compression_type.c_str()); #ifndef ROCKSDB_LITE - std::unique_ptr custom_env_guard; if (!FLAGS_hdfs.empty() && !FLAGS_env_uri.empty()) { fprintf(stderr, "Cannot provide both --hdfs and --env_uri.\n"); exit(1); } else if (!FLAGS_env_uri.empty()) { - FLAGS_env = NewCustomObject(FLAGS_env_uri, &custom_env_guard); + Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env); if (FLAGS_env == nullptr) { fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); exit(1); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 338f09fb992..86dfcc54e9e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -20,7 +20,6 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/debug.h" -#include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_util.h" #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" @@ -2854,8 +2853,9 @@ void BackupCommand::DoCommand() { return; } printf("open db OK\n"); - std::unique_ptr custom_env_guard; - Env* custom_env = NewCustomObject(backup_env_uri_, &custom_env_guard); + Env* custom_env = nullptr; + Env::LoadEnv(backup_env_uri_, &custom_env); + 
BackupableDBOptions backup_options = BackupableDBOptions(backup_dir_, custom_env); backup_options.info_log = logger_.get(); @@ -2889,8 +2889,9 @@ void RestoreCommand::Help(std::string& ret) { } void RestoreCommand::DoCommand() { - std::unique_ptr custom_env_guard; - Env* custom_env = NewCustomObject(backup_env_uri_, &custom_env_guard); + Env* custom_env = nullptr; + Env::LoadEnv(backup_env_uri_, &custom_env); + std::unique_ptr restore_engine; Status status; { diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc new file mode 100644 index 00000000000..3706e791e00 --- /dev/null +++ b/utilities/object_registry.cc @@ -0,0 +1,87 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/object_registry.h" + +#include "logging/logging.h" +#include "rocksdb/env.h" + +namespace rocksdb { +#ifndef ROCKSDB_LITE +// Looks through the "type" factories for one that matches "name". +// If found, returns the pointer to the Entry matching this name. +// Otherwise, nullptr is returned +const ObjectLibrary::Entry *ObjectLibrary::FindEntry( + const std::string &type, const std::string &name) const { + auto entries = entries_.find(type); + if (entries != entries_.end()) { + for (const auto &entry : entries->second) { + if (entry->matches(name)) { + return entry.get(); + } + } + } + return nullptr; +} + +void ObjectLibrary::AddEntry(const std::string &type, + std::unique_ptr &entry) { + auto &entries = entries_[type]; + entries.emplace_back(std::move(entry)); +} + +void ObjectLibrary::Dump(Logger *logger) const { + for (const auto &iter : entries_) { + ROCKS_LOG_HEADER(logger, " Registered factories for type[%s] ", + iter.first.c_str()); + bool printed_one = false; + for (const auto &e : iter.second) { + ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', + e->Name().c_str()); + printed_one = true; + } + } + ROCKS_LOG_HEADER(logger, "\n"); +} + +// Returns the Default singleton instance of the ObjectLibrary +// This instance will contain most of the "standard" registered objects +std::shared_ptr &ObjectLibrary::Default() { + static std::shared_ptr instance = + std::make_shared(); + return instance; +} + +std::shared_ptr ObjectRegistry::NewInstance() { + std::shared_ptr instance = std::make_shared(); + return instance; +} + +ObjectRegistry::ObjectRegistry() { + libraries_.push_back(ObjectLibrary::Default()); +} + +// Searches (from back to front) the libraries looking for the +// an entry that matches this pattern. 
+// Returns the entry if it is found, and nullptr otherwise +const ObjectLibrary::Entry *ObjectRegistry::FindEntry( + const std::string &type, const std::string &name) const { + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + const auto *entry = iter->get()->FindEntry(type, name); + if (entry != nullptr) { + return entry; + } + } + return nullptr; +} + +void ObjectRegistry::Dump(Logger *logger) const { + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + iter->get()->Dump(logger); + } +} + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index cc7c38d8a65..826931845dc 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -17,44 +17,145 @@ class EnvRegistryTest : public testing::Test { int EnvRegistryTest::num_a = 0; int EnvRegistryTest::num_b = 0; +static FactoryFunc test_reg_a = ObjectLibrary::Default()->Register( + "a://.*", + [](const std::string& /*uri*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + ++EnvRegistryTest::num_a; + return Env::Default(); + }); -static Registrar test_reg_a("a://.*", - [](const std::string& /*uri*/, - std::unique_ptr* /*env_guard*/) { - ++EnvRegistryTest::num_a; - return Env::Default(); - }); - -static Registrar test_reg_b("b://.*", [](const std::string& /*uri*/, - std::unique_ptr* env_guard) { - ++EnvRegistryTest::num_b; - // Env::Default() is a singleton so we can't grant ownership directly to the - // caller - we must wrap it first. - env_guard->reset(new EnvWrapper(Env::Default())); - return env_guard->get(); -}); +static FactoryFunc test_reg_b = ObjectLibrary::Default()->Register( + "b://.*", [](const std::string& /*uri*/, std::unique_ptr* env_guard, + std::string* /* errmsg */) { + ++EnvRegistryTest::num_b; + // Env::Default() is a singleton so we can't grant ownership directly to + // the caller - we must wrap it first. 
+ env_guard->reset(new EnvWrapper(Env::Default())); + return env_guard->get(); + }); TEST_F(EnvRegistryTest, Basics) { + std::string msg; std::unique_ptr env_guard; - auto res = NewCustomObject("a://test", &env_guard); + auto registry = ObjectRegistry::NewInstance(); + auto res = registry->NewObject("a://test", &env_guard, &msg); ASSERT_NE(res, nullptr); ASSERT_EQ(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(0, num_b); - res = NewCustomObject("b://test", &env_guard); + res = registry->NewObject("b://test", &env_guard, &msg); ASSERT_NE(res, nullptr); ASSERT_NE(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(1, num_b); - res = NewCustomObject("c://test", &env_guard); + res = registry->NewObject("c://test", &env_guard, &msg); ASSERT_EQ(res, nullptr); ASSERT_EQ(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(1, num_b); } +TEST_F(EnvRegistryTest, LocalRegistry) { + std::string msg; + std::unique_ptr guard; + auto registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "test-local", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ObjectLibrary::Default()->Register( + "test-global", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ASSERT_EQ( + ObjectRegistry::NewInstance()->NewObject("test-local", &guard, &msg), + nullptr); + ASSERT_NE( + ObjectRegistry::NewInstance()->NewObject("test-global", &guard, &msg), + nullptr); + ASSERT_NE(registry->NewObject("test-local", &guard, &msg), nullptr); + ASSERT_NE(registry->NewObject("test-global", &guard, &msg), nullptr); +} + +TEST_F(EnvRegistryTest, CheckShared) { + std::shared_ptr shared; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_OK(registry->NewSharedObject("guarded", &shared)); + ASSERT_NE(shared, nullptr); + shared.reset(); + ASSERT_NOK(registry->NewSharedObject("unguarded", &shared)); + ASSERT_EQ(shared, nullptr); +} + +TEST_F(EnvRegistryTest, CheckStatic) { + Env* env = nullptr; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_NOK(registry->NewStaticObject("guarded", &env)); + ASSERT_EQ(env, nullptr); + env = nullptr; + ASSERT_OK(registry->NewStaticObject("unguarded", &env)); + ASSERT_NE(env, nullptr); +} + +TEST_F(EnvRegistryTest, CheckUnique) { + std::unique_ptr unique; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* 
/*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_OK(registry->NewUniqueObject("guarded", &unique)); + ASSERT_NE(unique, nullptr); + unique.reset(); + ASSERT_NOK(registry->NewUniqueObject("unguarded", &unique)); + ASSERT_EQ(unique, nullptr); +} + } // namespace rocksdb int main(int argc, char** argv) { From f5b951f7b6b223f0373bec2d935a0a3a68c17d32 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 23 Jul 2019 19:34:56 -0700 Subject: [PATCH 250/572] Fix wrong info log printing for num_range_deletions (#5617) Summary: num_range_deletions printing is wrong in this log line: 2019/07/18-12:59:15.309271 7f869f9ff700 EVENT_LOG_v1 {"time_micros": 1563479955309228, "cf_name": "5", "job": 955, "event": "table_file_creation", "file_number": 34579, "file_size": 2239842, "table_properties": {"data_size": 1988792, "index_size": 3067, "index_partitions": 0, "top_level_index_size": 0, "index_key_is_user_key": 0, "index_value_is_delta_encoded": 1, "filter_size": 170821, "raw_key_size": 1951792, "raw_average_key_size": 16, "raw_value_size": 1731720, "raw_average_value_size": 14, "num_data_blocks": 199, "num_entries": 121987, "num_deletions": 15184, "num_merge_operands": 86512, "num_range_deletions": 86512, "format_version": 0, "fixed_key_len": 0, "filter_policy": "rocksdb.BuiltinBloomFilter", "column_family_name": "5", "column_family_id": 5, "comparator": "leveldb.BytewiseComparator", "merge_operator": "PutOperator", "prefix_extractor_name": "rocksdb.FixedPrefix.7", "property_collectors": "[]", "compression": "ZSTD", "compression_options": "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; ", "creation_time": 1563479951, "oldest_key_time": 0, "file_creation_time": 1563479954}} It actually prints the "num_merge_operands" value instead. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5617 Test Plan: Just build. Differential Revision: D16453110 fbshipit-source-id: fc1024b3cd5650312ed47a1379f0d2cf8b2d8a8f --- db/event_helpers.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/event_helpers.cc b/db/event_helpers.cc index f1b4b6417ed..4c38ad31400 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -106,7 +106,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "num_entries" << table_properties.num_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands - << "num_range_deletions" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions << "format_version" << table_properties.format_version << "fixed_key_len" << table_properties.fixed_key_len << "filter_policy" << table_properties.filter_policy_name From 66b524a9112bfe12d8e43cfb69e5ab7a65c9a950 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 24 Jul 2019 10:21:18 -0700 Subject: [PATCH 251/572] Simplify WriteUnpreparedTxnReadCallback and fix some comments (#5621) Summary: Simplify WriteUnpreparedTxnReadCallback so we just have one function `CalcMaxVisibleSeq`. Also, there's no need for the read callback to hold onto the transaction any more, so just hold the set of unprep_seqs, reducing the amount of indirection in `IsVisibleFullCheck`. Also, some comments about using the transaction snapshot were out of date, so remove them.
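The heart of this change is how the callback's visibility bound is computed. A standalone restatement of the `CalcMaxVisibleSeq` helper from the diff below, with the map type written out under the assumption that `unprep_seqs` maps prepare_seq to prepare_batch_cnt (as the surrounding comments state):
```
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>

using SequenceNumber = uint64_t;

// The read callback must not filter out the transaction's own unprepared
// writes, so the effective snapshot is extended to cover the largest
// unprepared sequence number, i.e. prepare_seq + prepare_batch_cnt - 1 of
// the last entry in the map.
SequenceNumber CalcMaxVisibleSeq(
    const std::map<SequenceNumber, size_t>& unprep_seqs,
    SequenceNumber snapshot_seq) {
  SequenceNumber max_unprepared = 0;
  if (!unprep_seqs.empty()) {
    max_unprepared =
        unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
  }
  return std::max(max_unprepared, snapshot_seq);
}
```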
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5621 Differential Revision: D16459883 Pulled By: lth fbshipit-source-id: cd581323fd18982e817d99af57b6eaba59e599bb --- .../transactions/write_unprepared_txn.cc | 15 ++------- utilities/transactions/write_unprepared_txn.h | 31 ++++++++++--------- .../transactions/write_unprepared_txn_db.cc | 30 ++++++++---------- 3 files changed, 32 insertions(+), 44 deletions(-) diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 4d1401b3aa1..9265c3d4afb 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -13,15 +13,13 @@ namespace rocksdb { bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { - auto unprep_seqs = txn_->GetUnpreparedSequenceNumbers(); - // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is // in unprep_seqs, we have to check if seq is equal to prep_seq or any of // the prepare_batch_cnt seq nums after it. // // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is // large. - for (const auto& it : unprep_seqs) { + for (const auto& it : unprep_seqs_) { if (it.first <= seq && seq < it.first + it.second) { return true; } @@ -30,15 +28,6 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { return db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); } -SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( - WriteUnpreparedTxn* txn) { - const auto& unprep_seqs = txn->GetUnpreparedSequenceNumbers(); - if (unprep_seqs.size()) { - return unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; - } - return 0; -} - WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const WriteOptions& write_options, const TransactionOptions& txn_options) @@ -537,7 +526,7 @@ Status WriteUnpreparedTxn::Get(const ReadOptions& options, const bool backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - this); + unprep_seqs_); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, value, &callback); if (LIKELY(wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index b64fd81e611..d81c30217df 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -53,17 +53,17 @@ class WriteUnpreparedTxn; // class WriteUnpreparedTxnReadCallback : public ReadCallback { public: - WriteUnpreparedTxnReadCallback(WritePreparedTxnDB* db, - SequenceNumber snapshot, - SequenceNumber min_uncommitted, - WriteUnpreparedTxn* txn) + WriteUnpreparedTxnReadCallback( + WritePreparedTxnDB* db, SequenceNumber snapshot, + SequenceNumber min_uncommitted, + const std::map& unprep_seqs) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out own writes. We // will do the exact comparison against snapshots in IsVisibleFullCheck // override. 
- : ReadCallback(CalcMaxVisibleSeq(txn, snapshot), min_uncommitted), + : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), db_(db), - txn_(txn), + unprep_seqs_(unprep_seqs), wup_snapshot_(snapshot) {} virtual bool IsVisibleFullCheck(SequenceNumber seq) override; @@ -74,15 +74,18 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { } private: - static SequenceNumber CalcMaxVisibleSeq(WriteUnpreparedTxn* txn, - SequenceNumber snapshot_seq) { - SequenceNumber max_unprepared = CalcMaxUnpreparedSequenceNumber(txn); + static SequenceNumber CalcMaxVisibleSeq( + const std::map& unprep_seqs, + SequenceNumber snapshot_seq) { + SequenceNumber max_unprepared = 0; + if (unprep_seqs.size()) { + max_unprepared = + unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; + } return std::max(max_unprepared, snapshot_seq); } - static SequenceNumber CalcMaxUnpreparedSequenceNumber( - WriteUnpreparedTxn* txn); WritePreparedTxnDB* db_; - WriteUnpreparedTxn* txn_; + const std::map& unprep_seqs_; SequenceNumber wup_snapshot_; }; @@ -124,8 +127,6 @@ class WriteUnpreparedTxn : public WritePreparedTxn { virtual Status RebuildFromWriteBatch(WriteBatch*) override; - const std::map& GetUnpreparedSequenceNumbers(); - protected: void Initialize(const TransactionOptions& txn_options) override; @@ -156,6 +157,8 @@ class WriteUnpreparedTxn : public WritePreparedTxn { friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test; friend class WriteUnpreparedTxnDB; + const std::map& GetUnpreparedSequenceNumbers(); + Status MaybeFlushWriteBatchToDB(); Status FlushWriteBatchToDB(bool prepared); Status HandleWrite(std::function do_write); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index c3fcd1f45d2..875d5416763 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -348,7 +348,8 @@ struct WriteUnpreparedTxnDB::IteratorState { IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, std::shared_ptr s, SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn) - : callback(txn_db, sequence, min_uncommitted, txn), snapshot(s) {} + : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_), + snapshot(s) {} SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); } WriteUnpreparedTxnReadCallback callback; @@ -384,27 +385,22 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, // foo: v5 5 // // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3, - // which is the last visible key. + // which is the last visible value. // // For unprepared transactions, if we have snap_seq = 3, but the current - // transaction has unprep_seq 5, then returning the first non-visible key + // transaction has unprep_seq 5, then returning the first non-visible value // would be incorrect, as we should return v5, and not v3. The problem is that - // there are committed keys at snapshot_seq < commit_seq < unprep_seq. + // there are committed values at snapshot_seq < commit_seq < unprep_seq. // // Snapshot validation can prevent this problem by ensuring that no committed - // keys exist at snapshot_seq < commit_seq, and thus any value with a sequence - // number greater than snapshot_seq must be unprepared keys. For example, if - // the transaction had a snapshot at 3, then snapshot validation would be - // performed during the Put(v5) call. 
It would find v4, and the Put would fail - // with snapshot validation failure. - // - // Because of this, if any writes have occurred, then the transaction snapshot - // must be used for the iterator. If no writes have occurred though, we can - // simply create a snapshot. Later writes would not be visible though, but we - // don't support iterating while writing anyway. + // values exist at snapshot_seq < commit_seq, and thus any value with a + // sequence number greater than snapshot_seq must be unprepared values. For + // example, if the transaction had a snapshot at 3, then snapshot validation + // would be performed during the Put(v5) call. It would find v4, and the Put + // would fail with snapshot validation failure. // // TODO(lth): Improve Prev() logic to continue iterating until - // max_visible_seq, and then return the last visible key, so that this + // max_visible_seq, and then return the last visible value, so that this // restriction can be lifted. const Snapshot* snapshot = nullptr; if (options.snapshot == nullptr) { @@ -418,9 +414,9 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, assert(snapshot_seq != kMaxSequenceNumber); // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are // guaranteed that for keys that were modified by this transaction (and thus - // might have unprepared versions), no committed versions exist at + // might have unprepared values), no committed values exist at // largest_validated_seq < commit_seq (or the contrapositive: any committed - // version must exist at commit_seq <= largest_validated_seq). This implies + // value must exist at commit_seq <= largest_validated_seq). This implies // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <= // snapshot_seq. As explained above, the problem with Prev() only happens when // snapshot_seq < commit_seq. From 5daa426a18bf5349584154b51a5404f2b1b69d1a Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 24 Jul 2019 12:04:58 -0700 Subject: [PATCH 252/572] Fix regression bug of Auto rolling logger when handling failures (#5622) Summary: The auto roll logger fails to handle file creation errors correctly, which may expose users to a segfault. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5622 Test Plan: Add a unit test that creates a log file under a non-existent directory. The test fails without the fix.
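From the caller's perspective, the contract after this fix is: on a failed log file creation, `CreateLoggerFromOptions` returns a non-OK status and leaves the logger null. A minimal defensive-usage sketch, modeled on the `FileCreateFailure` test in the diff below (the include path assumes the internal `logging/auto_roll_logger.h` header, which declares `CreateLoggerFromOptions`):
```
#include <cassert>
#include <memory>

#include "logging/auto_roll_logger.h"  // assumed to declare CreateLoggerFromOptions
#include "rocksdb/options.h"

void OpenLoggerSafely() {
  rocksdb::Options options;
  options.db_log_dir = "/a/dir/does/not/exist/at/all";  // forces a failure

  std::shared_ptr<rocksdb::Logger> logger;
  rocksdb::Status s = rocksdb::CreateLoggerFromOptions("", options, &logger);
  if (!s.ok()) {
    assert(!logger);  // on failure the logger stays null; do not use it
  }
}
```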
Differential Revision: D16460853 fbshipit-source-id: e96da4bef4f16db171ea04a11b2ec5a9448ddbde --- logging/auto_roll_logger.cc | 5 ++--- logging/auto_roll_logger_test.cc | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index ec240f5a334..223dfbe303c 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -46,9 +46,8 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, } GetExistingFiles(); ResetLogger(); - s = TrimOldLogFiles(); - if (!status_.ok()) { - status_ = s; + if (status_.ok()) { + status_ = TrimOldLogFiles(); } } diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index fa668114cfb..dd279d62a25 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -635,6 +635,15 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) { delete db; } +TEST_F(AutoRollLoggerTest, FileCreateFailure) { + Options options; + options.max_log_file_size = 100 * 1024 * 1024; + options.db_log_dir = "/a/dir/does/not/exist/at/all"; + + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); + ASSERT_TRUE(!logger); +} } // namespace rocksdb int main(int argc, char** argv) { From 7260347fd1af7d6f631bd4263368c8fd2a3bbbf2 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 24 Jul 2019 15:11:36 -0700 Subject: [PATCH 253/572] Auto Roll Logger to add some extra checking to avoid segfault. (#5623) Summary: AutoRollLogger sets GetStatus() to be non-OK if the log file fails to be created and logger_ is set to null. It is left to the caller to check the status before calling functions of this class. There is no harm in adding another null check on logger_ before using it, so that users who misuse the logger do not get a segfault. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5623 Test Plan: Run all existing tests. Differential Revision: D16466251 fbshipit-source-id: 262b885eec28bf741d91e9191c3cb5ff964e1bce --- logging/auto_roll_logger.cc | 14 +++++++++++++- logging/auto_roll_logger.h | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index 223dfbe303c..3109f0bc69c 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -155,6 +155,11 @@ std::string AutoRollLogger::ValistToString(const char* format, void AutoRollLogger::LogInternal(const char* format, ...) { mutex_.AssertHeld(); + + if (!logger_) { + return; + } + va_list args; va_start(args, format); logger_->Logv(format, args); @@ -163,7 +168,10 @@ void AutoRollLogger::LogInternal(const char* format, ...) { void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); - + if (!logger_) { + return; + } + std::shared_ptr logger; { MutexLock l(&mutex_); @@ -207,6 +215,10 @@ void AutoRollLogger::WriteHeaderInfo() { } void AutoRollLogger::LogHeader(const char* format, va_list args) { + if (!logger_) { + return; + } + // header message are to be retained in memory.
Since we cannot make any // assumptions about the data contained in va_list, we will retain them as // strings diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index a14fbfd5892..45cbc2697a1 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -41,6 +41,10 @@ class AutoRollLogger : public Logger { } size_t GetLogFileSize() const override { + if (!logger_) { + return 0; + } + std::shared_ptr logger; { MutexLock l(&mutex_); From d9dc6b4637276740a19ff8f649fc0d634342e960 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 24 Jul 2019 15:17:55 -0700 Subject: [PATCH 254/572] Declare snapshot refresh incompatible with delete range (#5625) Summary: The ::snap_refresh_nanos option is incompatible with the DeleteRange feature. Currently the code relies on range_del_agg.IsEmpty() to disable it if there are range delete tombstones. However ::IsEmpty does not guarantee that there are no RangeDelete tombstones in the SST files. The patch declares the two features incompatible in inline comments until we later figure out how to properly detect the presence of RangeDelete tombstones in compaction inputs. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5625 Differential Revision: D16468218 Pulled By: maysamyabandeh fbshipit-source-id: bd7beca278bc7e1db75e7ee4522d05a3a6ca86f4 --- include/rocksdb/options.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 35c27556553..896beba23fc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -275,6 +275,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // this option helps reducing the cpu usage of long-running compactions. The // feature is disabled when max_subcompactions is greater than one. // + // NOTE: This feature is currently incompatible with RangeDeletes. + // // Default: 0 // // Dynamically changeable through SetOptions() API From 0d16fad51b5b8ad41ccc70faab11599f7120b093 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Thu, 25 Jul 2019 11:42:31 -0700 Subject: [PATCH 255/572] rocksdb: build on macosx Summary: Make rocksdb build on macos:
1) Reorganize OS-specific flags and deps in rocksdb/src/TARGETS
2) Sandbox fbcode apple platform builds from repo root include path (which conflicts with layout of rocksdb headers).
3) Fix dep-translation for bzip2.
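The practical effect of the tuple lists in the TARGETS diff below is that Buck attaches each flag and dependency set only on the named platform; C++ sources then branch on the injected defines. A hypothetical consumption sketch (this function is invented for illustration, not RocksDB code):
```
// OS_LINUX / OS_MACOSX are the defines injected via
// ROCKSDB_OS_PREPROCESSOR_FLAGS in the TARGETS change below.
#include <string>

std::string BuildPlatform() {
#if defined(OS_LINUX)
  return "linux";    // Linux builds also get -DNUMA, -DROCKSDB_FALLOCATE_PRESENT, ...
#elif defined(OS_MACOSX)
  return "macos";    // apple platform builds only get -DOS_MACOSX here
#else
  return "unknown";
#endif
}
```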
Reviewed By: andrewjcg Differential Revision: D15125826 fbshipit-source-id: 8e143c689b88b5727e54881a5e80500f879a320b --- TARGETS | 86 ++++++++++++++++++++++++++++++++++++++++---------------- defs.bzl | 6 +++- 2 files changed, 67 insertions(+), 25 deletions(-) diff --git a/TARGETS b/TARGETS index ba6f96c0b5f..c0c6fd97fad 100644 --- a/TARGETS +++ b/TARGETS @@ -6,29 +6,9 @@ REPO_PATH = package_name() + "/" ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] @@ -41,11 +21,54 @@ ROCKSDB_EXTERNAL_DEPS = [ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -53,7 +76,6 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -70,9 +92,15 @@ sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) cpp_library( name = "rocksdb_lib", @@ -308,6 +336,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -328,6 +358,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -344,6 +376,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -355,6 +389,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -1113,6 +1149,8 @@ ROCKS_TESTS = [ rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, diff --git a/defs.bzl b/defs.bzl index f3e8339783e..a9f25ebcc42 100644 --- a/defs.bzl +++ b/defs.bzl @@ -8,9 +8,11 @@ def test_binary( test_cc, parallelism, rocksdb_arch_preprocessor_flags, + rocksdb_os_preprocessor_flags, rocksdb_compiler_flags, rocksdb_preprocessor_flags, - rocksdb_external_deps): + rocksdb_external_deps, + rocksdb_os_deps): TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh" ttype = "gtest" if parallelism == "parallel" else "simple" @@ -20,9 +22,11 @@ def test_binary( name = test_bin, srcs = [test_cc], arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + os_preprocessor_flags = rocksdb_os_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, deps = [":rocksdb_test_lib"], + os_deps = rocksdb_os_deps, external_deps = rocksdb_external_deps, ) From ae152ee666c34b31c4bb0fa5a8fdf46a6b5ea93b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 25 Jul 2019 15:23:46 -0700 Subject: [PATCH 256/572] Avoid user key copying for Get/Put/Write with user-timestamp (#5502) Summary: In previous https://github.com/facebook/rocksdb/issues/5079, we added user-specified timestamp to `DB::Get()` and `DB::Put()`. 
The limitation is that these two functions may cause extra memory allocation and key copying. The reason is that `WriteBatch` does not allocate extra memory for timestamps because it is not aware of the timestamp size, and we did not provide an API to assign/update the timestamp of each key within a `WriteBatch`. We address these issues in this PR by doing the following.
1. Add a `timestamp_size_` to `WriteBatch` so that `WriteBatch` can take timestamps into account when calling `WriteBatch::Put`, `WriteBatch::Delete`, etc.
2. Add APIs `WriteBatch::AssignTimestamp` and `WriteBatch::AssignTimestamps` so that applications can assign/update timestamps for each key in a `WriteBatch`.
3. Avoid key copying in `GetImpl` by adding a new constructor to `LookupKey`.
Test plan (on devserver):
```
$make clean && COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
If the API extension looks good, I will add more unit tests. Some simple benchmarks using db_bench.
```
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000 -disable_wal=true
```
Master is at a78503bd6c80a3c4137df1962a972fe406b4d90b.
```
|        | readrandom | fillrandom |
| master | 15.53 MB/s | 25.97 MB/s |
| PR5502 | 16.70 MB/s | 25.80 MB/s |
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5502 Differential Revision: D16340894 Pulled By: riversand963 fbshipit-source-id: 51132cf792be07d1efc3ac33f5768c4ee2608bb8 --- .gitignore | 1 + db/db_impl/db_impl.cc | 13 +-- db/db_impl/db_impl_write.cc | 12 +-- db/dbformat.cc | 12 ++- db/dbformat.h | 16 ---- db/lookup_key.h | 3 +- db/write_batch.cc | 146 ++++++++++++++++++++++++++++++++-- include/rocksdb/write_batch.h | 9 +++ util/coding.h | 16 +++- 9 files changed, 183 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 7a799c09a9d..c8672a8b31e 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +block_cache_trace_analyzer .DS_Store java/out diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 8132d5a0b38..54e401ddd5a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1441,16 +1441,7 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - if (nullptr == read_options.timestamp) { - return GetImpl(read_options, column_family, key, value); - } - Slice akey; - std::string buf; - Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf); - if (s.ok()) { - s = GetImpl(read_options, column_family, akey, value); - } - return s; + return GetImpl(read_options, column_family, key, value); } Status DBImpl::GetImpl(const ReadOptions& read_options, @@ -1528,7 +1519,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case.
- LookupKey lkey(key, snapshot); + LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); bool skip_memtable = (read_options.read_tier == kPersistedTier && diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 95a1b31c769..0ad2a3e9a86 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1734,14 +1734,16 @@ Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, } return Write(opt, &batch); } - Slice akey; - std::string buf; - Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf); + const Slice* ts = opt.timestamp; + assert(nullptr != ts); + size_t ts_sz = ts->size(); + WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, + ts_sz); + Status s = batch.Put(column_family, key, value); if (!s.ok()) { return s; } - WriteBatch batch(akey.size() + value.size() + 24); - s = batch.Put(column_family, akey, value); + s = batch.AssignTimestamp(*ts); if (!s.ok()) { return s; } diff --git a/db/dbformat.cc b/db/dbformat.cc index bfaea868b53..130ba4e8adf 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -159,9 +159,11 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, + const Slice* ts) { size_t usize = _user_key.size(); - size_t needed = usize + 13; // A conservative estimate + size_t ts_sz = (nullptr == ts) ? 0 : ts->size(); + size_t needed = usize + ts_sz + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { dst = space_; @@ -170,10 +172,14 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { } start_ = dst; // NOTE: We don't support users keys of more than 2GB :) - dst = EncodeVarint32(dst, static_cast(usize + 8)); + dst = EncodeVarint32(dst, static_cast(usize + ts_sz + 8)); kstart_ = dst; memcpy(dst, _user_key.data(), usize); dst += usize; + if (nullptr != ts) { + memcpy(dst, ts->data(), ts_sz); + dst += ts_sz; + } EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; end_ = dst; diff --git a/db/dbformat.h b/db/dbformat.h index c6ee5677c09..1d9b7ef7e3f 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -669,20 +669,4 @@ struct ParsedInternalKeyComparator { const InternalKeyComparator* cmp; }; -// TODO (yanqin): this causes extra memory allocation and copy. Should be -// addressed in the future. -inline Status AppendTimestamp(const Slice& key, const Slice& timestamp, - Slice* ret_key, std::string* ret_buf) { - assert(ret_key != nullptr); - assert(ret_buf != nullptr); - if (key.data() + key.size() == timestamp.data()) { - *ret_key = Slice(key.data(), key.size() + timestamp.size()); - } else { - ret_buf->assign(key.data(), key.size()); - ret_buf->append(timestamp.data(), timestamp.size()); - *ret_key = Slice(*ret_buf); - } - return Status::OK(); -} - } // namespace rocksdb diff --git a/db/lookup_key.h b/db/lookup_key.h index ddf4ff0e942..1b0f6f56290 100644 --- a/db/lookup_key.h +++ b/db/lookup_key.h @@ -21,7 +21,8 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. 
-  LookupKey(const Slice& _user_key, SequenceNumber sequence);
+  LookupKey(const Slice& _user_key, SequenceNumber sequence,
+            const Slice* ts = nullptr);

   ~LookupKey();
diff --git a/db/write_batch.cc b/db/write_batch.cc
index d7a2e792a33..2c2d81e87f6 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -135,6 +135,105 @@ struct BatchContentClassifier : public WriteBatch::Handler {
   }
 };

+class TimestampAssigner : public WriteBatch::Handler {
+ public:
+  explicit TimestampAssigner(const Slice& ts)
+      : timestamp_(ts), timestamps_(kEmptyTimestampList) {}
+  explicit TimestampAssigner(const std::vector<Slice>& ts_list)
+      : timestamps_(ts_list) {
+    SanityCheck();
+  }
+  ~TimestampAssigner() override {}
+
+  Status PutCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t, const Slice& key) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t, const Slice& key) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t, const Slice& begin_key,
+                       const Slice& end_key) override {
+    AssignTimestamp(begin_key);
+    AssignTimestamp(end_key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    // TODO (yanqin): support blob db in the future.
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+ private:
+  void SanityCheck() const {
+    assert(!timestamps_.empty());
+#ifndef NDEBUG
+    const size_t ts_sz = timestamps_[0].size();
+    for (size_t i = 1; i != timestamps_.size(); ++i) {
+      assert(ts_sz == timestamps_[i].size());
+    }
+#endif  // !NDEBUG
+  }
+
+  void AssignTimestamp(const Slice& key) {
+    assert(timestamps_.empty() || idx_ < timestamps_.size());
+    const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+    size_t ts_sz = ts.size();
+    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+    memcpy(ptr, ts.data(), ts_sz);
+  }
+
+  static const std::vector<Slice> kEmptyTimestampList;
+  const Slice timestamp_;
+  const std::vector<Slice>& timestamps_;
+  size_t idx_ = 0;
+
+  // No copy or move.
+  TimestampAssigner(const TimestampAssigner&) = delete;
+  TimestampAssigner(TimestampAssigner&&) = delete;
+  TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+  TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
 }  // anon namespace

 struct SavePoints {
@@ -142,7 +241,15 @@ struct SavePoints {
 };

 WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
-    : content_flags_(0), max_bytes_(max_bytes), rep_() {
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
   rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
                    ? reserved_bytes
                    : WriteBatchInternal::kHeader);
   rep_.resize(WriteBatchInternal::kHeader);
@@ -151,18 +258,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
 WriteBatch::WriteBatch(const std::string& rep)
     : content_flags_(ContentFlags::DEFERRED),
       max_bytes_(0),
-      rep_(rep) {}
+      rep_(rep),
+      timestamp_size_(0) {}

 WriteBatch::WriteBatch(std::string&& rep)
     : content_flags_(ContentFlags::DEFERRED),
       max_bytes_(0),
-      rep_(std::move(rep)) {}
+      rep_(std::move(rep)),
+      timestamp_size_(0) {}

 WriteBatch::WriteBatch(const WriteBatch& src)
     : wal_term_point_(src.wal_term_point_),
       content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
       max_bytes_(src.max_bytes_),
-      rep_(src.rep_) {
+      rep_(src.rep_),
+      timestamp_size_(src.timestamp_size_) {
   if (src.save_points_ != nullptr) {
     save_points_.reset(new SavePoints());
     save_points_->stack = src.save_points_->stack;
@@ -174,7 +284,8 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept
       wal_term_point_(std::move(src.wal_term_point_)),
       content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
       max_bytes_(src.max_bytes_),
-      rep_(std::move(src.rep_)) {}
+      rep_(std::move(src.rep_)),
+      timestamp_size_(src.timestamp_size_) {}

 WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
   if (&src != this) {
@@ -643,7 +754,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
     b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
     PutVarint32(&b->rep_, column_family_id);
   }
-  PutLengthPrefixedSlice(&b->rep_, key);
+  if (0 == b->timestamp_size_) {
+    PutLengthPrefixedSlice(&b->rep_, key);
+  } else {
+    PutVarint32(&b->rep_,
+                static_cast<uint32_t>(key.size() + b->timestamp_size_));
+    b->rep_.append(key.data(), key.size());
+    b->rep_.append(b->timestamp_size_, '\0');
+  }
   PutLengthPrefixedSlice(&b->rep_, value);
   b->content_flags_.store(
       b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@@ -692,7 +810,11 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
     b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
     PutVarint32(&b->rep_, column_family_id);
   }
-  PutLengthPrefixedSliceParts(&b->rep_, key);
+  if (0 == b->timestamp_size_) {
+    PutLengthPrefixedSliceParts(&b->rep_, key);
+  } else {
+    PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_);
+  }
   PutLengthPrefixedSliceParts(&b->rep_, value);
   b->content_flags_.store(
       b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@@ -1038,6 +1160,16 @@ Status WriteBatch::PopSavePoint() {
   return Status::OK();
 }

+Status WriteBatch::AssignTimestamp(const Slice& ts) {
+  TimestampAssigner ts_assigner(ts);
+  return Iterate(&ts_assigner);
+}
+
+Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
+  TimestampAssigner ts_assigner(ts_list);
+  return Iterate(&ts_assigner);
+}
+
 class MemTableInserter : public WriteBatch::Handler {

   SequenceNumber sequence_;
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h
index 29b660d1987..393c5d9c6ab 100644
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@@ -28,6 +28,7 @@
 #include <atomic>
 #include <stack>
 #include <string>
+#include <vector>
 #include "rocksdb/status.h"
 #include "rocksdb/write_batch_base.h"

@@ -60,6 +61,7 @@ struct SavePoint {

 class WriteBatch : public WriteBatchBase {
  public:
   explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0);
+  explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz);
   ~WriteBatch() override;

   using WriteBatchBase::Put;
@@ -311,6 +313,12 @@ class WriteBatch : public WriteBatchBase {
   // Returns true if MarkRollback will be called during Iterate
   bool HasRollback() const;

+  // Assign timestamp to write batch
+  Status AssignTimestamp(const Slice& ts);
+
+  // Assign timestamps to write batch
+  Status AssignTimestamps(const std::vector<Slice>& ts_list);
+
   using WriteBatchBase::GetWriteBatch;
   WriteBatch* GetWriteBatch() override { return this; }
@@ -361,6 +369,7 @@ class WriteBatch : public WriteBatchBase {
 protected:
   std::string rep_;  // See comment in write_batch.cc for the format of rep_
+  const size_t timestamp_size_;

   // Intentionally copyable
 };
diff --git a/util/coding.h b/util/coding.h
index 9427d52618e..3ad6d957007 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -50,6 +50,8 @@ extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
 extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
 extern void PutLengthPrefixedSliceParts(std::string* dst,
                                         const SliceParts& slice_parts);
+extern void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz);

 // Standard Get... routines parse a value from the beginning of a Slice
 // and advance the slice past the parsed value.
@@ -306,9 +308,8 @@ inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
   dst->append(value.data(), value.size());
 }

-inline void PutLengthPrefixedSliceParts(std::string* dst,
+inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes,
                                         const SliceParts& slice_parts) {
-  size_t total_bytes = 0;
   for (int i = 0; i < slice_parts.num_parts; ++i) {
     total_bytes += slice_parts.parts[i].size();
   }
@@ -318,6 +319,17 @@ inline void PutLengthPrefixedSliceParts(std::string* dst,
   }
 }

+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts);
+}
+
+inline void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts);
+  dst->append(pad_sz, '\0');
+}
+
 inline int VarintLength(uint64_t v) {
   int len = 1;
   while (v >= 128) {

From 9625a2bc2b56c92487922c192f4b903083e63c2c Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Thu, 25 Jul 2019 22:38:53 -0700
Subject: [PATCH 257/572] Added SizeApproximationOptions to DB::GetApproximateSizes (#5626)

Summary:
Adds a new DB::GetApproximateSizes overload that takes a SizeApproximationOptions argument, which allows adding more options/knobs to the DB::GetApproximateSizes call (beyond only the include_flags)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5626

Differential Revision: D16496913

Pulled By: elipoz

fbshipit-source-id: ee8c6c182330a285fa056ecfc3905a592b451720
---
 HISTORY.md                               |  2 ++
 db/db_impl/db_impl.cc                    | 17 ++++++++++-------
 db/db_impl/db_impl.h                     |  7 ++++---
 db/db_test.cc                            |  9 +++++----
 include/rocksdb/db.h                     | 24 +++++++++++++++++-------
 include/rocksdb/options.h                | 14 ++++++++++----
 include/rocksdb/utilities/stackable_db.h |  9 +++++----
 7 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 59205341020..ace55cab404 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -21,6 +21,7 @@
 * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469
 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator.
 * Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from strings; also adds LoadEnv() to Env.
+* Added a new overload of GetApproximateSizes which takes a SizeApproximationOptions object and returns a Status. The older overloads redirect their calls to this new method and no longer assert if include_flags has neither the INCLUDE_MEMTABLES nor the INCLUDE_FILES bit set. It's recommended to use only the new method, as it is more type-safe and returns a meaningful status in case of errors.

 ### New Features
 * Add an option `snap_refresh_nanos` (defaults to 0) to periodically refresh the snapshot list in compaction jobs. Assign 0 to disable the feature.
@@ -29,6 +30,7 @@
 * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.
 * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This keeps the original DB intact.
 * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
+* Added DBOptions::log_readahead_size, which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), prefetching is disabled.

 ### Performance Improvements
 * Reduce binary search when an iterator reseeks into the same data block.
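A minimal usage sketch of the new overload (not part of the patch; it assumes an open `DB* db`, and the key bounds are made up):

```
SizeApproximationOptions approx_options;
approx_options.include_memtables = true;  // also count unflushed memtable data
approx_options.include_files = true;
Range ranges[1];
ranges[0] = Range("key000", "key999");  // hypothetical bounds
uint64_t sizes[1] = {0};
Status s = db->GetApproximateSizes(approx_options, db->DefaultColumnFamily(),
                                   ranges, 1, sizes);
// Unlike the legacy overload, misuse (both include flags false) now returns
// Status::InvalidArgument instead of asserting.
```

Because the legacy include_flags overloads forward to this method, existing callers keep working unchanged.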
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 54e401ddd5a..16a6d86a658 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2770,11 +2770,13 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
   ReturnAndCleanupSuperVersion(cfd, sv);
 }

-void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                 const Range* range, int n, uint64_t* sizes,
-                                 uint8_t include_flags) {
-  assert(include_flags & DB::SizeApproximationFlags::INCLUDE_FILES ||
-         include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES);
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+                                   ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes) {
+  if (!options.include_memtables && !options.include_files) {
+    return Status::InvalidArgument("Invalid options");
+  }
+
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
@@ -2786,18 +2788,19 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
     sizes[i] = 0;
-    if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) {
+    if (options.include_files) {
       sizes[i] += versions_->ApproximateSize(
           v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1,
           TableReaderCaller::kUserApproximateSize);
     }
-    if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) {
+    if (options.include_memtables) {
       sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
       sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
     }
   }

   ReturnAndCleanupSuperVersion(cfd, sv);
+  return Status::OK();
 }

 std::list<uint64_t>::iterator
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 547e3e1d6be..fe3a2f6f20f 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -233,9 +233,10 @@ class DBImpl : public DB {
   virtual bool GetAggregatedIntProperty(const Slice& property,
                                         uint64_t* aggregated_value) override;
   using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(
-      ColumnFamilyHandle* column_family, const Range* range, int n,
-      uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) override;
   using DB::GetApproximateMemTableStats;
   virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
                                            const Range& range,
diff --git a/db/db_test.cc b/db/db_test.cc
index 36bdda59e21..f247ddb80fa 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -2598,13 +2598,14 @@ class ModelDB : public DB {
     return false;
   }
   using DB::GetApproximateSizes;
-  void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/,
-                           const Range* /*range*/, int n, uint64_t* sizes,
-                           uint8_t /*include_flags*/
-                           = INCLUDE_FILES) override {
+  Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+                             ColumnFamilyHandle* /*column_family*/,
+                             const Range* /*range*/, int n,
+                             uint64_t* sizes) override {
     for (int i = 0; i < n; i++) {
       sizes[i] = 0;
     }
+    return Status::OK();
   }
   using DB::GetApproximateMemTableStats;
   void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index d90ca900f45..1d90dc50b4b 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -808,7 +808,7 @@ class DB {
   // stats should be included, or file stats approximation or both
   enum SizeApproximationFlags : uint8_t {
     NONE = 0,
-    INCLUDE_MEMTABLES = 1,
+    INCLUDE_MEMTABLES = 1 << 0,
     INCLUDE_FILES = 1 << 1
   };

@@ -818,14 +818,24 @@ class DB {
   // Note that the returned sizes measure file system space usage, so
   // if the user data compresses by a factor of ten, the returned
   // sizes will be one-tenth the size of the corresponding user data size.
-  //
-  // If include_flags defines whether the returned size should include
-  // the recently written data in the mem-tables (if
-  // the mem-table type supports it), data serialized to disk, or both.
-  // include_flags should be of type DB::SizeApproximationFlags
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) = 0;
+
+  // Simpler versions of the GetApproximateSizes() method above.
+  // The include_flags argument must be of type DB::SizeApproximationFlags
+  // and cannot be NONE.
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
                                    const Range* range, int n, uint64_t* sizes,
-                                   uint8_t include_flags = INCLUDE_FILES) = 0;
+                                   uint8_t include_flags = INCLUDE_FILES) {
+    SizeApproximationOptions options;
+    options.include_memtables =
+        (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
+    options.include_files =
+        (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
+    GetApproximateSizes(options, column_family, range, n, sizes);
+  }
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
                                    uint8_t include_flags = INCLUDE_FILES) {
     GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 896beba23fc..5ae010b8f52 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1093,10 +1093,6 @@ struct DBOptions {
   // The number of bytes to prefetch when reading the log. This is mostly useful
   // for reading a remotely located log, as it can save the number of
   // round-trips. If 0, then the prefetching is disabled.
-
-  // If non-zero, we perform bigger reads when reading the log.
-  // This is mostly useful for reading a remotely located log, as it can save
-  // the number of round-trips. If 0, then the prefetching is disabled.
   //
   // Default: 0
   size_t log_readahead_size = 0;
@@ -1510,4 +1506,14 @@ struct ImportColumnFamilyOptions {
   bool move_files = false;
 };

+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+  // Defines whether the returned size should include the recently written
+  // data in the mem-tables. If set to false, include_files must be true.
+  bool include_memtables = false;
+  // Defines whether the returned size should include data serialized to disk.
+  // If set to false, include_memtables must be true.
+  bool include_files = true;
+};
+
 }  // namespace rocksdb
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index a52aff5d8b1..67bf4e2fa6b 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -209,10 +209,11 @@ class StackableDB : public DB {
   }

   using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(
-      ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes,
-      uint8_t include_flags = INCLUDE_FILES) override {
-    return db_->GetApproximateSizes(column_family, r, n, sizes, include_flags);
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* r, int n,
+                                     uint64_t* sizes) override {
+    return db_->GetApproximateSizes(options, column_family, r, n, sizes);
   }

   using DB::GetApproximateMemTableStats;

From 74782cec325e32824699bd4385df2c914d261721 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Fri, 26 Jul 2019 09:52:23 -0700
Subject: [PATCH 258/572] Fix target 'clean' to include parallel test binaries
 (#5629)

Summary:
The current `clean` target in the Makefile does not remove parallel test binaries. Fix this.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5629

Test Plan:
(on devserver)
Take file_reader_writer_test for instance.
```
$make -j32 file_reader_writer_test
$make clean
```
Verify that the binary file 'file_reader_writer_test' is deleted by `make clean`.

Differential Revision: D16513176

Pulled By: riversand963

fbshipit-source-id: 70acb9f56c928a494964121b86aacc0090f31ff6
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 65d884fa4fb..7f5772c6273 100644
--- a/Makefile
+++ b/Makefile
@@ -1068,7 +1068,7 @@ rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
 	build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc

 clean:
-	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED)
+	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED)
 	rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
 	$(FIND) . -name "*.[oda]" -exec rm -f {} \;
 	$(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;

From 230b909da8e9a9c807ed559f45c2d9a3dae4aa78 Mon Sep 17 00:00:00 2001
From: Manuel Ung
Date: Fri, 26 Jul 2019 11:31:46 -0700
Subject: [PATCH 259/572] Fix PopSavePoint to merge info into the previous
 savepoint (#5628)

Summary:
Transaction::RollbackToSavePoint undoes the modifications made since the savepoint began, and also unlocks the corresponding keys, which are tracked in the last SavePoint. Currently ::PopSavePoint simply discards these tracked keys, leaving them locked in the lock manager. This breaks a subsequent ::RollbackToSavePoint behavior, as it loses track of such keys and thus cannot unlock them. The patch fixes ::PopSavePoint by passing on the tracked key information to the previous SavePoint.
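For illustration, a minimal sketch of the scenario being fixed (it mirrors the new SavepointTest4 added below; `txn` is an assumed pessimistic transaction):

```
txn->SetSavePoint();         // savepoint 1
txn->Put("A", "a");
txn->SetSavePoint();         // savepoint 2
txn->Put("B", "b");
txn->PopSavePoint();         // folds savepoint 2's tracked keys into 1
txn->RollbackToSavePoint();  // now undoes and unlocks both A and B
```

Before this patch, the PopSavePoint call dropped B's lock-tracking info, so the final rollback left B locked.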
Fixes https://github.com/facebook/rocksdb/issues/5618

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5628

Differential Revision: D16505325

Pulled By: lth

fbshipit-source-id: 2bc3b30963ab4d36d996d1f66543c93abf358980
---
 utilities/transactions/transaction_base.cc | 43 +++++++++++++++++-
 utilities/transactions/transaction_base.h  | 10 +++--
 utilities/transactions/transaction_test.cc | 52 ++++++++++++++++++++++
 utilities/transactions/transaction_util.h  |  8 ++++
 4 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc
index 5621a7fa372..bf59a1c4069 100644
--- a/utilities/transactions/transaction_base.cc
+++ b/utilities/transactions/transaction_base.cc
@@ -192,7 +192,48 @@ Status TransactionBaseImpl::PopSavePoint() {
   }

   assert(!save_points_->empty());
-  save_points_->pop();
+  // If there is another savepoint A below the current savepoint B, then A needs
+  // to inherit tracked_keys in B so that if we rollback to savepoint A, we
+  // remember to unlock keys in B. If there is no other savepoint below, then we
+  // can safely discard savepoint info.
+  if (save_points_->size() == 1) {
+    save_points_->pop();
+  } else {
+    TransactionBaseImpl::SavePoint top;
+    std::swap(top, save_points_->top());
+    save_points_->pop();
+
+    const TransactionKeyMap& curr_cf_key_map = top.new_keys_;
+    TransactionKeyMap& prev_cf_key_map = save_points_->top().new_keys_;
+
+    for (const auto& curr_cf_key_iter : curr_cf_key_map) {
+      uint32_t column_family_id = curr_cf_key_iter.first;
+      const std::unordered_map<std::string, TransactionKeyMapInfo>& curr_keys =
+          curr_cf_key_iter.second;
+
+      // If cfid was not previously tracked, just copy everything over.
+      auto prev_keys_iter = prev_cf_key_map.find(column_family_id);
+      if (prev_keys_iter == prev_cf_key_map.end()) {
+        prev_cf_key_map.emplace(curr_cf_key_iter);
+      } else {
+        std::unordered_map<std::string, TransactionKeyMapInfo>& prev_keys =
+            prev_keys_iter->second;
+        for (const auto& key_iter : curr_keys) {
+          const std::string& key = key_iter.first;
+          const TransactionKeyMapInfo& info = key_iter.second;
+          // If key was not previously tracked, just copy the whole struct over.
+          // Otherwise, some merging needs to occur.
+          auto prev_info = prev_keys.find(key);
+          if (prev_info == prev_keys.end()) {
+            prev_keys.emplace(key_iter);
+          } else {
+            prev_info->second.Merge(info);
+          }
+        }
+      }
+    }
+  }
+
   return write_batch_.PopSavePoint();
 }

diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h
index 26efd51b378..657e9c59656 100644
--- a/utilities/transactions/transaction_base.h
+++ b/utilities/transactions/transaction_base.h
@@ -294,11 +294,11 @@ class TransactionBaseImpl : public Transaction {

   struct SavePoint {
     std::shared_ptr<const Snapshot> snapshot_;
-    bool snapshot_needed_;
+    bool snapshot_needed_ = false;
     std::shared_ptr<TransactionNotifier> snapshot_notifier_;
-    uint64_t num_puts_;
-    uint64_t num_deletes_;
-    uint64_t num_merges_;
+    uint64_t num_puts_ = 0;
+    uint64_t num_deletes_ = 0;
+    uint64_t num_merges_ = 0;

     // Record all keys tracked since the last savepoint
     TransactionKeyMap new_keys_;
@@ -312,6 +312,8 @@ class TransactionBaseImpl : public Transaction {
           num_puts_(num_puts),
           num_deletes_(num_deletes),
           num_merges_(num_merges) {}
+
+    SavePoint() = default;
   };

   // Records writes pending in this transaction
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 7868d0060e9..534103a545e 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -4030,6 +4030,58 @@ TEST_P(TransactionTest, SavepointTest3) {
   ASSERT_TRUE(s.IsNotFound());
 }

+TEST_P(TransactionTest, SavepointTest4) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->SetSavePoint();  // 1
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 2
+  ASSERT_OK(s);
+
+  // Verify that A/B still exists.
+  std::string value;
+  ASSERT_OK(txn1->Get(read_options, "A", &value));
+  ASSERT_EQ("a", value);
+
+  ASSERT_OK(txn1->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that everything was rolled back.
+  s = txn1->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Nothing should be locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn2->Put("B", "");
+  ASSERT_OK(s);
+
+  delete txn2;
+  delete txn1;
+}
+
 TEST_P(TransactionTest, UndoGetForUpdateTest) {
   WriteOptions write_options;
   ReadOptions read_options;
diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h
index 1d910134b66..b1f9f24cb4e 100644
--- a/utilities/transactions/transaction_util.h
+++ b/utilities/transactions/transaction_util.h
@@ -31,6 +31,14 @@ struct TransactionKeyMapInfo {

   explicit TransactionKeyMapInfo(SequenceNumber seq_no)
       : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+  // Used in PopSavePoint to collapse two savepoints together.
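+  // The newer savepoint's read/write counts are accumulated into this entry,
+  // the exclusive flag is OR-ed, and the older `seq` (this entry's, per the
+  // assert below) is kept.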
+ void Merge(const TransactionKeyMapInfo& info) { + assert(seq <= info.seq); + num_reads += info.num_reads; + num_writes += info.num_writes; + exclusive |= info.exclusive; + } }; using TransactionKeyMap = From 3617287e0ec4593587c59909079d40a32209bbe4 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 26 Jul 2019 11:44:32 -0700 Subject: [PATCH 260/572] Parallelize db_bloom_filter_test (#5632) Summary: This test frequently times out under TSAN; parallelizing it should fix this issue. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5632 Test Plan: make check buck test mode/dev-tsan internal_repo_rocksdb/repo:db_bloom_filter_test Differential Revision: D16519399 Pulled By: ltamasi fbshipit-source-id: 66e05a644d6f79c6d544255ffcf6de195d2d62fe --- Makefile | 2 +- TARGETS | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7f5772c6273..7aef277d590 100644 --- a/Makefile +++ b/Makefile @@ -442,7 +442,6 @@ TESTS = \ db_block_cache_test \ db_test \ db_blob_index_test \ - db_bloom_filter_test \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ @@ -569,6 +568,7 @@ TESTS = \ PARALLEL_TEST = \ backupable_db_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_merge_operator_test \ diff --git a/TARGETS b/TARGETS index c0c6fd97fad..a54e56b9835 100644 --- a/TARGETS +++ b/TARGETS @@ -586,7 +586,7 @@ ROCKS_TESTS = [ [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "serial", + "parallel", ], [ "db_compaction_filter_test", From 41df7348308fe74fb92bbfa0e330d863524a381a Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Fri, 26 Jul 2019 12:52:07 -0700 Subject: [PATCH 261/572] WriteUnPrepared: Add new variable write_batch_flush_threshold (#5633) Summary: Instead of reusing `TransactionOptions::max_write_batch_size` for determining when to flush a write batch for write unprepared, add a new variable called `write_batch_flush_threshold` for this use case instead. Also add `TransactionDBOptions::default_write_batch_flush_threshold` which sets the default value if `TransactionOptions::write_batch_flush_threshold` is unspecified. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5633 Differential Revision: D16520364 Pulled By: lth fbshipit-source-id: d75ae5a2141ce7708982d5069dc3f0b58d250e8c --- include/rocksdb/utilities/transaction_db.h | 10 +++++++ utilities/transactions/transaction_test.cc | 12 ++------- .../write_unprepared_transaction_test.cc | 12 ++++----- .../transactions/write_unprepared_txn.cc | 27 +++++++++++-------- utilities/transactions/write_unprepared_txn.h | 4 +-- 5 files changed, 36 insertions(+), 29 deletions(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index db32ba0bc3a..33826bab861 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -101,6 +101,11 @@ struct TransactionDBOptions { // ordering rather than concurrency control. bool skip_concurrency_control = false; + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + ssize_t default_write_batch_flush_threshold = 0; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast(7); @@ -162,6 +167,11 @@ struct TransactionOptions { // back/commit before new transactions start. 
// Default: false bool skip_concurrency_control = false; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + ssize_t write_batch_flush_threshold = -1; }; // The per-write optimizations that do not involve transactions. TransactionDB diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 534103a545e..98548dd9555 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -5303,16 +5303,8 @@ TEST_P(TransactionTest, MemoryLimitTest) { ASSERT_EQ(2, txn->GetNumPuts()); s = txn->Put(Slice("b"), Slice("....")); - auto pdb = reinterpret_cast(db); - // For write unprepared, write batches exceeding max_write_batch_size will - // just flush to DB instead of returning a memory limit error. - if (pdb->GetTxnDBOptions().write_policy != WRITE_UNPREPARED) { - ASSERT_TRUE(s.IsMemoryLimit()); - ASSERT_EQ(2, txn->GetNumPuts()); - } else { - ASSERT_OK(s); - ASSERT_EQ(3, txn->GetNumPuts()); - } + ASSERT_TRUE(s.IsMemoryLimit()); + ASSERT_EQ(2, txn->GetNumPuts()); txn->Rollback(); delete txn; diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index a2546229e4d..feaedea067f 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -157,7 +157,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) { Transaction* txn; TransactionOptions txn_options; // batch_size of 1 causes writes to DB for every marker. - txn_options.max_write_batch_size = 1; + txn_options.write_batch_flush_threshold = 1; ReadOptions read_options; for (uint32_t i = 0; i < kNumIter; i++) { @@ -311,7 +311,7 @@ TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) { // batch_size of 1 causes writes to DB for every marker. for (size_t batch_size : {1, 1000000}) { - txn_options.max_write_batch_size = batch_size; + txn_options.write_batch_flush_threshold = batch_size; for (bool empty : {true, false}) { for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) { for (int num_batches = 1; num_batches < 10; num_batches++) { @@ -332,7 +332,7 @@ TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) { txn->SetName("xid"); for (int i = 0; i < num_batches; i++) { ASSERT_OK(txn->Put("k" + ToString(i), "value" + ToString(i))); - if (txn_options.max_write_batch_size == 1) { + if (txn_options.write_batch_flush_threshold == 1) { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i + 1); } else { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); @@ -398,7 +398,7 @@ TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) { // batch_size of 1 causes writes to DB for every marker. 
for (size_t batch_size : {1, 1000000}) { - txn_options.max_write_batch_size = batch_size; + txn_options.write_batch_flush_threshold = batch_size; for (bool prepare : {false, true}) { for (bool commit : {false, true}) { ReOpen(); @@ -408,7 +408,7 @@ TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) { for (int i = 0; i < kNumKeys; i++) { txn->Put("k" + ToString(i), "v" + ToString(i)); - if (txn_options.max_write_batch_size == 1) { + if (txn_options.write_batch_flush_threshold == 1) { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i + 1); } else { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); @@ -457,7 +457,7 @@ TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) { WriteOptions write_options; TransactionOptions txn_options; // batch_size of 1 causes writes to DB for every marker. - txn_options.max_write_batch_size = 1; + txn_options.write_batch_flush_threshold = 1; const int kNumKeys = 10; WriteOptions wopts; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 9265c3d4afb..c677013aa03 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -35,13 +35,12 @@ WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, wupt_db_(txn_db), recovered_txn_(false), largest_validated_seq_(0) { - max_write_batch_size_ = txn_options.max_write_batch_size; - // We set max bytes to zero so that we don't get a memory limit error. - // Instead of trying to keep write batch strictly under the size limit, we - // just flush to DB when the limit is exceeded in write unprepared, to avoid - // having retry logic. This also allows very big key-value pairs that exceed - // max bytes to succeed. - write_batch_.SetMaxBytes(0); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } } WriteUnpreparedTxn::~WriteUnpreparedTxn() { @@ -71,8 +70,13 @@ WriteUnpreparedTxn::~WriteUnpreparedTxn() { void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { PessimisticTransaction::Initialize(txn_options); - max_write_batch_size_ = txn_options.max_write_batch_size; - write_batch_.SetMaxBytes(0); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } + unprep_seqs_.clear(); recovered_txn_ = false; largest_validated_seq_ = 0; @@ -222,8 +226,9 @@ Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { const bool kPrepared = true; Status s; - if (max_write_batch_size_ != 0 && - write_batch_.GetDataSize() > max_write_batch_size_) { + if (write_batch_flush_threshold_ > 0 && + write_batch_.GetDataSize() > + static_cast(write_batch_flush_threshold_)) { assert(GetState() != PREPARED); s = FlushWriteBatchToDB(!kPrepared); } diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index d81c30217df..feac749ee82 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -164,10 +164,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status HandleWrite(std::function do_write); // For write unprepared, 
we check on every writebatch append to see if - // max_write_batch_size_ has been exceeded, and then call + // write_batch_flush_threshold_ has been exceeded, and then call // FlushWriteBatchToDB if so. This logic is encapsulated in // MaybeFlushWriteBatchToDB. - size_t max_write_batch_size_; + ssize_t write_batch_flush_threshold_; WriteUnpreparedTxnDB* wupt_db_; // Ordered list of unprep_seq sequence numbers that we have already written From 70c7302fb5d343fa319e05327ecf88d09fe26a2b Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 26 Jul 2019 14:36:16 -0700 Subject: [PATCH 262/572] Block cache simulator: Add pysim to simulate caches using reinforcement learning. (#5610) Summary: This PR implements cache eviction using reinforcement learning. It includes two implementations: 1. An implementation of Thompson Sampling for the Bernoulli Bandit [1]. 2. An implementation of LinUCB with disjoint linear models [2]. The idea is that a cache uses multiple eviction policies, e.g., MRU, LRU, and LFU. The cache learns which eviction policy is the best and uses it upon a cache miss. Thompson Sampling is contextless and does not include any features. LinUCB includes features such as level, block type, caller, column family id to decide which eviction policy to use. [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. DOI: https://doi.org/10.1561/2200000070 [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5610 Differential Revision: D16435067 Pulled By: HaoyuHuang fbshipit-source-id: 6549239ae14115c01cb1e70548af9e46d8dc21bb --- .gitignore | 1 + CMakeLists.txt | 4 +- Makefile | 4 +- TARGETS | 6 +- src.mk | 6 +- tools/block_cache_analyzer/__init__.py | 2 + .../block_cache_analyzer/block_cache_pysim.py | 864 ++++++++++++++++++ .../block_cache_analyzer/block_cache_pysim.sh | 118 +++ .../block_cache_pysim_test.py | 340 +++++++ .../block_cache_trace_analyzer.cc | 14 +- .../block_cache_trace_analyzer.h | 0 .../block_cache_trace_analyzer_plot.py | 0 .../block_cache_trace_analyzer_test.cc | 4 +- .../block_cache_trace_analyzer_tool.cc | 2 +- 14 files changed, 1345 insertions(+), 20 deletions(-) create mode 100644 tools/block_cache_analyzer/__init__.py create mode 100644 tools/block_cache_analyzer/block_cache_pysim.py create mode 100644 tools/block_cache_analyzer/block_cache_pysim.sh create mode 100644 tools/block_cache_analyzer/block_cache_pysim_test.py rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer.cc (99%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer.h (100%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_plot.py (100%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_test.cc (99%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_tool.cc (91%) diff --git a/.gitignore b/.gitignore index c8672a8b31e..199458901ec 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ manifest_dump sst_dump blob_dump block_cache_trace_analyzer +tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 086975f3e8f..7266f3b55c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -626,7 +626,7 @@ set(SOURCES test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc - tools/block_cache_trace_analyzer.cc + tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc @@ -976,7 +976,7 @@ if(WITH_TESTS) table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc - tools/block_cache_trace_analyzer_test.cc + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc diff --git a/Makefile b/Makefile index 7aef277d590..fbe6d2d06ff 100644 --- a/Makefile +++ b/Makefile @@ -1114,7 +1114,7 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) -block_cache_trace_analyzer: tools/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) @@ -1614,7 +1614,7 @@ db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_cache_trace_analyzer_test: tools/block_cache_trace_analyzer_test.o tools/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) +block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- diff --git a/TARGETS b/TARGETS index a54e56b9835..884d69b14bc 100644 --- a/TARGETS +++ b/TARGETS @@ -351,7 +351,7 @@ cpp_library( "test_util/fault_injection_test_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", - "tools/block_cache_trace_analyzer.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", "utilities/cassandra/test_utils.cc", ], @@ -369,7 +369,7 @@ cpp_library( name = "rocksdb_tools_lib", srcs = [ "test_util/testutil.cc", - "tools/block_cache_trace_analyzer.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", ], @@ -430,7 +430,7 @@ ROCKS_TESTS = [ ], [ "block_cache_trace_analyzer_test", - "tools/block_cache_trace_analyzer_test.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", "serial", ], [ diff --git a/src.mk b/src.mk index 3462a6a58bb..0c6142e41ad 100644 --- a/src.mk +++ b/src.mk @@ -246,7 +246,7 @@ TOOL_LIB_SOURCES = \ utilities/blob_db/blob_dump_tool.cc \ ANALYZER_LIB_SOURCES = \ - tools/block_cache_trace_analyzer.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer.cc \ tools/trace_analyzer_tool.cc \ MOCK_LIB_SOURCES = \ @@ -374,8 +374,8 @@ MAIN_SOURCES = \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ - tools/block_cache_trace_analyzer_test.cc \ - tools/block_cache_trace_analyzer_tool.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_analyzer/__init__.py 
b/tools/block_cache_analyzer/__init__.py
new file mode 100644
index 00000000000..8dbe96a7850
--- /dev/null
+++ b/tools/block_cache_analyzer/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py
new file mode 100644
index 00000000000..63e367be5a7
--- /dev/null
+++ b/tools/block_cache_analyzer/block_cache_pysim.py
@@ -0,0 +1,864 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import gc
+import random
+import sys
+import time
+from os import path
+
+import numpy as np
+
+
+kSampleSize = 16  # The sample size used when performing eviction.
+kMicrosInSecond = 1000000
+kSecondsInMinute = 60
+kSecondsInHour = 3600
+
+
+class TraceRecord:
+    """
+    A trace record represents a block access.
+    It holds the same struct as BlockCacheTraceRecord in
+    trace_replay/block_cache_tracer.h
+    """
+
+    def __init__(
+        self,
+        access_time,
+        block_id,
+        block_type,
+        block_size,
+        cf_id,
+        cf_name,
+        level,
+        fd,
+        caller,
+        no_insert,
+        get_id,
+        key_id,
+        kv_size,
+        is_hit,
+    ):
+        self.access_time = access_time
+        self.block_id = block_id
+        self.block_type = block_type
+        self.block_size = block_size
+        self.cf_id = cf_id
+        self.cf_name = cf_name
+        self.level = level
+        self.fd = fd
+        self.caller = caller
+        if no_insert == 1:
+            self.no_insert = True
+        else:
+            self.no_insert = False
+        self.get_id = get_id
+        self.key_id = key_id
+        self.kv_size = kv_size
+        if is_hit == 1:
+            self.is_hit = True
+        else:
+            self.is_hit = False
+
+
+class CacheEntry:
+    """A cache entry stored in the cache."""
+
+    def __init__(self, value_size, cf_id, level, block_type, access_number):
+        self.value_size = value_size
+        self.last_access_number = access_number
+        self.num_hits = 0
+        self.cf_id = cf_id
+        self.level = level
+        self.block_type = block_type
+
+    def __repr__(self):
+        """Debug string."""
+        return "s={},last={},hits={},cf={},l={},bt={}".format(
+            self.value_size,
+            self.last_access_number,
+            self.num_hits,
+            self.cf_id,
+            self.level,
+            self.block_type,
+        )
+
+
+class HashEntry:
+    """A hash entry stored in a hash table."""
+
+    def __init__(self, key, hash, value):
+        self.key = key
+        self.hash = hash
+        self.value = value
+
+    def __repr__(self):
+        return "k={},h={},v=[{}]".format(self.key, self.hash, self.value)
+
+
+class HashTable:
+    """
+    A custom implementation of hash table to support fast random sampling.
+    It resolves hash collisions by chaining, and it grows/shrinks the table
+    upon insertion/deletion to support fast lookups and random samplings.
+    """
+
+    def __init__(self):
+        self.table = [None] * 32
+        self.elements = 0
+
+    def random_sample(self, sample_size):
+        """Randomly sample 'sample_size' hash entries from the table."""
+        samples = []
+        index = random.randint(0, len(self.table) - 1)
+        pos = (index + 1) % len(self.table)
+        searches = 0
+        # Starting from index, adding hash entries to the sample list until
+        # sample_size is met or we ran out of entries.
+        while pos != index and len(samples) < sample_size:
+            if self.table[pos] is not None:
+                for i in range(len(self.table[pos])):
+                    if self.table[pos][i] is None:
+                        continue
+                    samples.append(self.table[pos][i])
+                    if len(samples) >= sample_size:
+                        break
+            pos += 1
+            pos = pos % len(self.table)
+            searches += 1
+        return samples
+
+    def insert(self, key, hash, value):
+        """
+        Insert a hash entry in the table. Replace the old entry if it already
+        exists.
+        """
+        self.grow()
+        inserted = False
+        index = hash % len(self.table)
+        if self.table[index] is None:
+            self.table[index] = []
+        for i in range(len(self.table[index])):
+            if self.table[index][i] is not None:
+                if (
+                    self.table[index][i].hash == hash
+                    and self.table[index][i].key == key
+                ):
+                    # The entry already exists in the table.
+                    self.table[index][i] = HashEntry(key, hash, value)
+                    return
+                continue
+            self.table[index][i] = HashEntry(key, hash, value)
+            inserted = True
+            break
+        if not inserted:
+            self.table[index].append(HashEntry(key, hash, value))
+        self.elements += 1
+
+    def resize(self, new_size):
+        if new_size == len(self.table):
+            return
+        if new_size == 0:
+            return
+        if self.elements < 100:
+            return
+        new_table = [None] * new_size
+        # Copy 'self.table' to new_table.
+        for i in range(len(self.table)):
+            entries = self.table[i]
+            if entries is None:
+                continue
+            for j in range(len(entries)):
+                if entries[j] is None:
+                    continue
+                index = entries[j].hash % new_size
+                if new_table[index] is None:
+                    new_table[index] = []
+                new_table[index].append(entries[j])
+        self.table = new_table
+        del new_table
+        # Manually call python gc here to free the memory as 'self.table'
+        # might be very large.
+        gc.collect()
+
+    def grow(self):
+        if self.elements < len(self.table):
+            return
+        new_size = int(len(self.table) * 1.2)
+        self.resize(new_size)
+
+    def delete(self, key, hash):
+        index = hash % len(self.table)
+        entries = self.table[index]
+        deleted = False
+        if entries is None:
+            return
+        for i in range(len(entries)):
+            if (
+                entries[i] is not None
+                and entries[i].hash == hash
+                and entries[i].key == key
+            ):
+                entries[i] = None
+                self.elements -= 1
+                deleted = True
+                break
+        if deleted:
+            self.shrink()
+
+    def shrink(self):
+        if self.elements * 2 >= len(self.table):
+            return
+        new_size = int(len(self.table) * 0.7)
+        self.resize(new_size)
+
+    def lookup(self, key, hash):
+        index = hash % len(self.table)
+        entries = self.table[index]
+        if entries is None:
+            return None
+        for entry in entries:
+            if entry is not None and entry.hash == hash and entry.key == key:
+                return entry.value
+        return None
+
+
+class MissRatioStats:
+    def __init__(self, time_unit):
+        self.num_misses = 0
+        self.num_accesses = 0
+        self.time_unit = time_unit
+        self.time_misses = {}
+        self.time_accesses = {}
+
+    def update_metrics(self, access_time, is_hit):
+        access_time //= kMicrosInSecond * self.time_unit
+        self.num_accesses += 1
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if not is_hit:
+            self.num_misses += 1
+            if access_time not in self.time_misses:
+                self.time_misses[access_time] = 0
+            self.time_misses[access_time] += 1
+
+    def reset_counter(self):
+        self.num_misses = 0
+        self.num_accesses = 0
+
+    def miss_ratio(self):
+        return float(self.num_misses) * 100.0 / float(self.num_accesses)
+
+    def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                row += ",{}".format(self.time_misses.get(trace_time, 0))
+            file.write(row + "\n")
+
+    def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                naccesses = self.time_accesses.get(trace_time, 0)
+                miss_ratio = 0
+                if naccesses > 0:
+                    miss_ratio = float(
+                        self.time_misses.get(trace_time, 0) * 100.0
+                    ) / float(naccesses)
+                row += ",{0:.2f}".format(miss_ratio)
+            file.write(row + "\n")
+
+
+class PolicyStats:
+    def __init__(self, time_unit, policies):
+        self.time_selected_polices = {}
+        self.time_accesses = {}
+        self.policy_names = {}
+        self.time_unit = time_unit
+        for i in range(len(policies)):
+            self.policy_names[i] = policies[i].policy_name()
+
+    def update_metrics(self, access_time, selected_policy):
+        access_time //= kMicrosInSecond * self.time_unit
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if access_time not in self.time_selected_polices:
+            self.time_selected_polices[access_time] = {}
+        policy_name = self.policy_names[selected_policy]
+        if policy_name not in self.time_selected_polices[access_time]:
+            self.time_selected_polices[access_time][policy_name] = 0
+        self.time_selected_polices[access_time][policy_name] += 1
+
+    def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    row += ",{}".format(
+                        self.time_selected_polices.get(trace_time, {}).get(
+                            policy_name, 0
+                        )
+                    )
+                file.write(row + "\n")
+
+    def write_policy_ratio_timeline(
+        self, cache_type, cache_size, result_dir, start, end
+    ):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    naccesses = self.time_accesses.get(trace_time, 0)
+                    ratio = 0
+                    if naccesses > 0:
+                        ratio = float(
+                            self.time_selected_polices.get(trace_time, {}).get(
+                                policy_name, 0
+                            )
+                            * 100.0
+                        ) / float(naccesses)
+                    row += ",{0:.2f}".format(ratio)
+                file.write(row + "\n")
+
+
+class Policy(object):
+    """
+    A policy maintains a set of evicted keys. It returns a reward of one to
+    itself if it has not evicted a missing key. Otherwise, it gives itself 0
+    reward.
+    """
+
+    def __init__(self):
+        self.evicted_keys = {}
+
+    def evict(self, key, max_size):
+        self.evicted_keys[key] = 0
+
+    def delete(self, key):
+        self.evicted_keys.pop(key, None)
+
+    def prioritize_samples(self, samples):
+        raise NotImplementedError
+
+    def policy_name(self):
+        raise NotImplementedError
+
+    def generate_reward(self, key):
+        if key in self.evicted_keys:
+            return 0
+        return 1
+
+
+class LRUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(samples, key=lambda e: e.value.last_access_number)
+
+    def policy_name(self):
+        return "lru"
+
+
+class MRUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(
+            samples, key=lambda e: e.value.last_access_number, reverse=True
+        )
+
+    def policy_name(self):
+        return "mru"
+
+
+class LFUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(samples, key=lambda e: e.value.num_hits)
+
+    def policy_name(self):
+        return "lfu"
+
+
+class MLCache(object):
+    def __init__(self, cache_size, enable_cache_row_key, policies):
+        self.cache_size = cache_size
+        self.used_size = 0
+        self.miss_ratio_stats = MissRatioStats(kSecondsInMinute)
+        self.policy_stats = PolicyStats(kSecondsInMinute, policies)
+        self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour)
+        self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies)
+        self.table = HashTable()
+        self.enable_cache_row_key = enable_cache_row_key
+        self.get_id_row_key_map = {}
+        self.policies = policies
+
+    def _lookup(self, key, hash):
+        value = self.table.lookup(key, hash)
+        if value is not None:
+            value.last_access_number = self.miss_ratio_stats.num_accesses
+            value.num_hits += 1
+            return True
+        return False
+
+    def _select_policy(self, trace_record, key):
+        raise NotImplementedError
+
+    def cache_name(self):
+        raise NotImplementedError
+
+    def _evict(self, policy_index, value_size):
+        # Randomly sample n entries.
+ samples = self.table.random_sample(kSampleSize) + samples = self.policies[policy_index].prioritize_samples(samples) + for hash_entry in samples: + self.used_size -= hash_entry.value.value_size + self.table.delete(hash_entry.key, hash_entry.hash) + self.policies[policy_index].evict( + key=hash_entry.key, max_size=self.table.elements + ) + if self.used_size + value_size <= self.cache_size: + break + + def _insert(self, trace_record, key, hash, value_size): + if value_size > self.cache_size: + return + policy_index = self._select_policy(trace_record, key) + self.policies[policy_index].delete(key) + self.policy_stats.update_metrics(trace_record.access_time, policy_index) + self.per_hour_policy_stats.update_metrics( + trace_record.access_time, policy_index + ) + while self.used_size + value_size > self.cache_size: + self._evict(policy_index, value_size) + self.table.insert( + key, + hash, + CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + self.miss_ratio_stats.num_accesses, + ), + ) + self.used_size += value_size + + def _access_kv(self, trace_record, key, hash, value_size, no_insert): + if self._lookup(key, hash): + return True + if not no_insert and value_size > 0: + self._insert(trace_record, key, hash, value_size) + return False + + def _update_stats(self, access_time, is_hit): + self.miss_ratio_stats.update_metrics(access_time, is_hit) + self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit) + + def access(self, trace_record): + assert self.used_size <= self.cache_size + if ( + self.enable_cache_row_key + and trace_record.caller == 1 + and trace_record.key_id != 0 + and trace_record.get_id != 0 + ): + # This is a get request. + if trace_record.get_id not in self.get_id_row_key_map: + self.get_id_row_key_map[trace_record.get_id] = {} + self.get_id_row_key_map[trace_record.get_id]["h"] = False + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + self._update_stats(trace_record.access_time, is_hit=True) + return + if trace_record.key_id not in self.get_id_row_key_map[trace_record.get_id]: + # First time seen this key. + is_hit = self._access_kv( + trace_record, + key="g{}".format(trace_record.key_id), + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + inserted = False + if trace_record.kv_size > 0: + inserted = True + self.get_id_row_key_map[trace_record.get_id][ + trace_record.key_id + ] = inserted + self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + self._update_stats(trace_record.access_time, is_hit=True) + return + # Access its blocks. + is_hit = self._access_kv( + trace_record, + key="b{}".format(trace_record.block_id), + hash=trace_record.block_id, + value_size=trace_record.block_size, + no_insert=trace_record.no_insert, + ) + self._update_stats(trace_record.access_time, is_hit) + if ( + trace_record.kv_size > 0 + and not self.get_id_row_key_map[trace_record.get_id][ + trace_record.key_id + ] + ): + # Insert the row key-value pair. + self._access_kv( + trace_record, + key="g{}".format(trace_record.key_id), + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + # Mark as inserted. + self.get_id_row_key_map[trace_record.get_id][trace_record.key_id] = True + return + # Access the block. 
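+        # Non-Get accesses (and all accesses when row-key caching is
+        # disabled) fall through to a plain block lookup keyed by block_id.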
+ is_hit = self._access_kv( + trace_record, + key="b{}".format(trace_record.block_id), + hash=trace_record.block_id, + value_size=trace_record.block_size, + no_insert=trace_record.no_insert, + ) + self._update_stats(trace_record.access_time, is_hit) + + +class ThompsonSamplingCache(MLCache): + """ + An implementation of Thompson Sampling for the Bernoulli Bandit [1]. + [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, + and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. + Trends Mach. Learn. 11, 1 (July 2018), 1-96. + DOI: https://doi.org/10.1561/2200000070 + """ + + def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b=1): + super(ThompsonSamplingCache, self).__init__( + cache_size, enable_cache_row_key, policies + ) + self._as = {} + self._bs = {} + for _i in range(len(policies)): + self._as = [init_a] * len(self.policies) + self._bs = [init_b] * len(self.policies) + + def _select_policy(self, trace_record, key): + samples = [ + np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies)) + ] + selected_policy = max(range(len(self.policies)), key=lambda x: samples[x]) + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self._as[selected_policy] += reward + self._bs[selected_policy] += 1 - reward + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid ThompsonSampling (ts_hybrid)" + return "ThompsonSampling (ts)" + + +class LinUCBCache(MLCache): + """ + An implementation of LinUCB with disjoint linear models [2]. + [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. + A contextual-bandit approach to personalized news article recommendation. + In Proceedings of the 19th international conference on World wide web + (WWW '10). ACM, New York, NY, USA, 661-670. + DOI=http://dx.doi.org/10.1145/1772690.1772758 + """ + + def __init__(self, cache_size, enable_cache_row_key, policies): + super(LinUCBCache, self).__init__(cache_size, enable_cache_row_key, policies) + self.nfeatures = 4 # Block type, caller, level, cf. 
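+ # Per-policy LinUCB state: th/th_hat hold the coefficient estimates, A is
+ # the design matrix (A_inv caches its inverse), b is the response vector,
+ # and alph scales the upper confidence bound used for exploration.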
+ self.th = np.zeros((len(self.policies), self.nfeatures)) + self.eps = 0.2 + self.b = np.zeros_like(self.th) + self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + for i in range(len(self.policies)): + self.A[i] = np.identity(self.nfeatures) + self.th_hat = np.zeros_like(self.th) + self.p = np.zeros(len(self.policies)) + self.alph = 0.2 + + def _select_policy(self, trace_record, key): + x_i = np.zeros(self.nfeatures) # The current context vector + x_i[0] = trace_record.block_type + x_i[1] = trace_record.caller + x_i[2] = trace_record.level + x_i[3] = trace_record.cf_id + p = np.zeros(len(self.policies)) + for a in range(len(self.policies)): + self.th_hat[a] = self.A_inv[a].dot(self.b[a]) + ta = x_i.dot(self.A_inv[a]).dot(x_i) + a_upper_ci = self.alph * np.sqrt(ta) + a_mean = self.th_hat[a].dot(x_i) + p[a] = a_mean + a_upper_ci + p = p + (np.random.random(len(p)) * 0.000001) + selected_policy = p.argmax() + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self.A[selected_policy] += np.outer(x_i, x_i) + self.b[selected_policy] += reward * x_i + self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy]) + del x_i + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LinUCB (linucb_hybrid)" + return "LinUCB (linucb)" + + +def parse_cache_size(cs): + cs = cs.replace("\n", "") + if cs[-1] == "M": + return int(cs[: len(cs) - 1]) * 1024 * 1024 + if cs[-1] == "G": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 + if cs[-1] == "T": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 * 1024 + return int(cs) + + +def create_cache(cache_type, cache_size, downsample_size): + policies = [] + policies.append(LRUPolicy()) + policies.append(MRUPolicy()) + policies.append(LFUPolicy()) + cache_size = cache_size / downsample_size + enable_cache_row_key = False + if "hybrid" in cache_type: + enable_cache_row_key = True + cache_type = cache_type[:-7] + if cache_type == "ts": + return ThompsonSamplingCache(cache_size, enable_cache_row_key, policies) + elif cache_type == "linucb": + return LinUCBCache(cache_size, enable_cache_row_key, policies) + else: + print("Unknown cache type {}".format(cache_type)) + assert False + return None + + +def run(trace_file_path, cache_type, cache, warmup_seconds): + warmup_complete = False + num = 0 + trace_start_time = 0 + trace_duration = 0 + start_time = time.time() + time_interval = 1 + trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + num += 1 + if num % 1000000 == 0: + # Force a python gc periodically to reduce memory usage. 
+ gc.collect() + ts = line.split(",") + timestamp = int(ts[0]) + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + if not warmup_complete and trace_duration > warmup_seconds * 1000000: + cache.miss_ratio_stats.reset_counter() + warmup_complete = True + record = TraceRecord( + access_time=int(ts[0]), + block_id=int(ts[1]), + block_type=int(ts[2]), + block_size=int(ts[3]), + cf_id=int(ts[4]), + cf_name=ts[5], + level=int(ts[6]), + fd=int(ts[7]), + caller=int(ts[8]), + no_insert=int(ts[9]), + get_id=int(ts[10]), + key_id=int(ts[11]), + kv_size=int(ts[12]), + is_hit=int(ts[13]), + ) + trace_miss_ratio_stats.update_metrics( + record.access_time, is_hit=record.is_hit + ) + cache.access(record) + del record + if num % 100 != 0: + continue + # Report progress every 10 seconds. + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. Throughput: {} records/second. " + "Trace miss ratio {}".format( + now - start_time, + num, + trace_duration / 1000000, + num / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + time_interval += 1 + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + now = time.time() + print( + "Take {} seconds to process {} trace records with trace duration of {} " + "seconds. Throughput: {} records/second. Trace miss ratio {}".format( + now - start_time, + num, + trace_duration / 1000000, + num / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + return trace_start_time, trace_duration + + +def report_stats( + cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time +): + cache_label = "{}-{}".format(cache_type, cache_size) + with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: + mrc_file.write( + "{},0,0,{},{},{}\n".format( + cache_type, + cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + cache.policy_stats.write_policy_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.policy_stats.write_policy_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.miss_ratio_stats.write_miss_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.miss_ratio_stats.write_miss_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_policy_stats.write_policy_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_policy_stats.write_policy_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_miss_ratio_stats.write_miss_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_miss_ratio_stats.write_miss_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + + +if __name__ == "__main__": + if len(sys.argv) <= 6: + print( + "Must provide 6 arguments. " + "1) cache_type (ts, ts_hybrid, linucb, linucb_hybrid). " + "2) cache size (xM, xG, xT). " + "3) The sampling frequency used to collect the trace. (The " + "simulation scales down the cache size by the sampling frequency). " + "4) Warmup seconds (The number of seconds used for warmup). " + "5) Trace file path. 
" + "6) Result directory (A directory that saves generated results)" + ) + exit(1) + cache_type = sys.argv[1] + cache_size = parse_cache_size(sys.argv[2]) + downsample_size = int(sys.argv[3]) + warmup_seconds = int(sys.argv[4]) + trace_file_path = sys.argv[5] + result_dir = sys.argv[6] + cache = create_cache(cache_type, cache_size, downsample_size) + trace_start_time, trace_duration = run( + trace_file_path, cache_type, cache, warmup_seconds + ) + trace_end_time = trace_start_time + trace_duration + report_stats( + cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) diff --git a/tools/block_cache_analyzer/block_cache_pysim.sh b/tools/block_cache_analyzer/block_cache_pysim.sh new file mode 100644 index 00000000000..58193a0635a --- /dev/null +++ b/tools/block_cache_analyzer/block_cache_pysim.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to run a batch of pysims and combine individual pysim output files. +# +# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs +# trace_file_path: The file path that stores the traces. +# result_dir: The directory to store pysim results. The output files from a pysim is stores in result_dir/ml +# downsample_size: The downsample size used to collect the trace. +# warmup_seconds: The number of seconds used for warmup. +# max_jobs: The max number of concurrent pysims to run. + +if [ $# -ne 5 ]; then + echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" + exit 0 +fi + +trace_file="$1" +result_dir="$2" +downsample_size="$3" +warmup_seconds="$4" +max_jobs="$5" +current_jobs=0 + +ml_tmp_result_dir="$result_dir/ml" +rm -rf "$ml_tmp_result_dir" +mkdir -p "$result_dir" +mkdir -p "$ml_tmp_result_dir" + +for cache_type in "ts" "linucb" "ts_hybrid" "linucb_hybrid" +do +for cache_size in "16M" "256M" "1G" "2G" "4G" "8G" "12G" "16G" +do + while [ "$current_jobs" -ge "$max_jobs" ] + do + sleep 10 + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + done + output="log-ml-$cache_type-$cache_size" + echo "Running simulation for $cache_type and cache size $cache_size. Number of running jobs: $current_jobs. " + nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" >& $ml_tmp_result_dir/$output & + current_jobs=$((current_jobs+1)) +done +done + +# Wait for all jobs to complete. +while [ $current_jobs -gt 0 ] +do + sleep 10 + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. 
Number of running jobs: $current_jobs" +done + +echo "Combine individual pysim output files" + +rm -rf "$result_dir/ml_*" +mrc_file="$result_dir/ml_mrc" +for header in "header-" "data-" +do +for fn in $ml_tmp_result_dir/* +do + sum_file="" + time_unit="" + capacity="" + if [[ $fn == *"timeline"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit_index=0 + capacity_index=0 + for i in "${elements[@]}" + do + if [[ $i == "timeline" ]]; then + break + fi + time_unit_index=$((time_unit_index+1)) + done + time_unit_index=$((time_unit_index+1)) + capacity_index=$((time_unit_index+2)) + time_unit="${elements[$time_unit_index]}_" + capacity="${elements[$capacity_index]}_" + fi + + if [[ $fn == "${header}ml-policy-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}policy_timeline" + fi + if [[ $fn == "${header}ml-policy-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}policy_ratio_timeline" + fi + if [[ $fn == "${header}ml-miss-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}miss_timeline" + fi + if [[ $fn == "${header}ml-miss-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}miss_ratio_timeline" + fi + if [[ $fn == "${header}ml-mrc"* ]]; then + sum_file="$mrc_file" + fi + if [[ $sum_file == "" ]]; then + continue + fi + if [[ $header == "header-" ]]; then + if [ -e "$sum_file" ]; then + continue + fi + fi + cat "$ml_tmp_result_dir/$fn" >> "$sum_file" +done +done + +echo "Done" +# Sort MRC file by cache_type and cache_size. +tmp_file="$result_dir/tmp_mrc" +cat "$mrc_file" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" +cat "$tmp_file" > "$mrc_file" +rm -rf "$tmp_file" diff --git a/tools/block_cache_analyzer/block_cache_pysim_test.py b/tools/block_cache_analyzer/block_cache_pysim_test.py new file mode 100644 index 00000000000..e298d7bbd6f --- /dev/null +++ b/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
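+# Tests for block_cache_pysim: the hash table, the LRU/MRU/LFU eviction
+# policies, and the Thompson Sampling and LinUCB caches (including the
+# hybrid row-block variants).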
+ +import random + +from block_cache_pysim import ( + HashTable, + LFUPolicy, + LinUCBCache, + LRUPolicy, + MRUPolicy, + ThompsonSamplingCache, + TraceRecord, + kSampleSize, +) + + +def test_hash_table(): + print("Test hash table") + table = HashTable() + data_size = 10000 + for i in range(data_size): + table.insert("k{}".format(i), i, "v{}".format(i)) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is not None + for i in range(data_size): + table.delete("k{}".format(i), i) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is None + + truth_map = {} + n = 1000000 + records = 100 + for i in range(n): + key_id = random.randint(0, records) + key = "k{}".format(key_id) + value = "v{}".format(key_id) + action = random.randint(0, 2) + # print "{}:{}:{}".format(action, key, value) + assert len(truth_map) == table.elements, "{} {} {}".format( + len(truth_map), table.elements, i + ) + if action == 0: + table.insert(key, key_id, value) + truth_map[key] = value + elif action == 1: + if key in truth_map: + assert table.lookup(key, key_id) is not None + assert truth_map[key] == table.lookup(key, key_id) + else: + assert table.lookup(key, key_id) is None + else: + table.delete(key, key_id) + if key in truth_map: + del truth_map[key] + print("Test hash table: Success") + + +def assert_metrics(cache, expected_value): + assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format( + expected_value[0], cache.used_size + ) + assert ( + cache.miss_ratio_stats.num_accesses == expected_value[1] + ), "Expected {}, Actual {}".format( + expected_value[1], cache.miss_ratio_stats.num_accesses + ) + assert ( + cache.miss_ratio_stats.num_misses == expected_value[2] + ), "Expected {}, Actual {}".format( + expected_value[2], cache.miss_ratio_stats.num_misses + ) + assert cache.table.elements == len(expected_value[3]) + len( + expected_value[4] + ), "Expected {}, Actual {}".format( + len(expected_value[3]) + len(expected_value[4]), cache.table.elements + ) + for expeceted_k in expected_value[3]: + val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k) + assert val is not None + assert val.value_size == 1 + for expeceted_k in expected_value[4]: + val = cache.table.lookup("g{}".format(expeceted_k), expeceted_k) + assert val is not None + assert val.value_size == 1 + + +# Access k1, k1, k2, k3, k3, k3, k4 +def test_cache(policies, expected_value): + cache = ThompsonSamplingCache(3, False, policies) + k1 = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k2 = TraceRecord( + access_time=1, + block_id=2, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k3 = TraceRecord( + access_time=2, + block_id=3, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k4 = TraceRecord( + access_time=3, + block_id=4, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + sequence = [k1, k1, k2, k3, k3, k3] + index = 0 + expected_values = [] + # Access k1, miss. + expected_values.append([1, 1, 1, [1], []]) + # Access k1, hit. + expected_values.append([1, 2, 1, [1], []]) + # Access k2, miss. 
+ expected_values.append([2, 3, 2, [1, 2], []]) + # Access k3, miss. + expected_values.append([3, 4, 3, [1, 2, 3], []]) + # Access k3, hit. + expected_values.append([3, 5, 3, [1, 2, 3], []]) + # Access k3, hit. + expected_values.append([3, 6, 3, [1, 2, 3], []]) + for access in sequence: + cache.access(access) + assert_metrics(cache, expected_values[index]) + index += 1 + cache.access(k4) + assert_metrics(cache, expected_value) + + +def test_lru_cache(): + print("Test LRU cache") + policies = [] + policies.append(LRUPolicy()) + # Access k4, miss. evict k1 + test_cache(policies, [3, 7, 4, [2, 3, 4], []]) + print("Test LRU cache: Success") + + +def test_mru_cache(): + print("Test MRU cache") + policies = [] + policies.append(MRUPolicy()) + # Access k4, miss. evict k3 + test_cache(policies, [3, 7, 4, [1, 2, 4], []]) + print("Test MRU cache: Success") + + +def test_lfu_cache(): + print("Test LFU cache") + policies = [] + policies.append(LFUPolicy()) + # Access k4, miss. evict k2 + test_cache(policies, [3, 7, 4, [1, 3, 4], []]) + print("Test LFU cache: Success") + + +def test_mix(cache): + print("Test Mix {} cache".format(cache.cache_name())) + n = 100000 + records = 199 + for i in range(n): + key_id = random.randint(0, records) + vs = random.randint(0, 10) + k = TraceRecord( + access_time=i, + block_id=key_id, + block_type=1, + block_size=vs, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=key_id, + key_id=key_id, + kv_size=5, + is_hit=1, + ) + cache.access(k) + assert cache.miss_ratio_stats.miss_ratio() > 0 + print("Test Mix {} cache: Success".format(cache.cache_name())) + + +def test_hybrid(cache): + print("Test {} cache".format(cache.cache_name())) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. + is_hit=1, + ) + cache.access(k) # Expect a miss. + # used size, num accesses, num misses, hash table size, blocks, get keys. + assert_metrics(cache, [1, 1, 1, [1], []]) + k.access_time += 1 + k.kv_size = 1 + k.block_id = 2 + cache.access(k) # k should be inserted. + assert_metrics(cache, [3, 2, 2, [1, 2], [1]]) + k.access_time += 1 + k.block_id = 3 + cache.access(k) # k should not be inserted again. + assert_metrics(cache, [4, 3, 3, [1, 2, 3], [1]]) + # A second get request referencing the same key. + k.access_time += 1 + k.get_id = 2 + k.block_id = 4 + k.kv_size = 0 + cache.access(k) # k should observe a hit. No block access. + assert_metrics(cache, [4, 4, 3, [1, 2, 3], [1]]) + + # A third get request searches three files, three different keys. + # And the second key observes a hit. + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 3 + k.key_id = 2 + cache.access(k) # k should observe a miss. block 3 observes a hit. + assert_metrics(cache, [5, 5, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 1 + cache.access(k) # k1 should observe a hit. + assert_metrics(cache, [5, 6, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 3 + # k3 should observe a miss. + # However, as the get already complete, we should not access k3 any more. + cache.access(k) + assert_metrics(cache, [5, 7, 3, [1, 2, 3], [1, 2]]) + + # A fourth get request searches one file and two blocks. One row key. 
+ k.access_time += 1 + k.get_id = 4 + k.block_id = 5 + k.key_id = 4 + k.kv_size = 1 + cache.access(k) + assert_metrics(cache, [7, 8, 4, [1, 2, 3, 5], [1, 2, 4]]) + + # A bunch of insertions which evict cached row keys. + for i in range(6, 100): + k.access_time += 1 + k.get_id = 0 + k.block_id = i + cache.access(k) + + k.get_id = 4 + k.block_id = 100 # A different block. + k.key_id = 4 # Same row key and should not be inserted again. + k.kv_size = 1 + cache.access(k) + assert_metrics(cache, [16, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]) + print("Test {} cache: Success".format(cache.cache_name())) + + +if __name__ == "__main__": + policies = [] + policies.append(MRUPolicy()) + policies.append(LRUPolicy()) + policies.append(LFUPolicy()) + test_hash_table() + test_lru_cache() + test_mru_cache() + test_lfu_cache() + test_mix(ThompsonSamplingCache(100, False, policies)) + test_mix(ThompsonSamplingCache(100, True, policies)) + test_mix(LinUCBCache(100, False, policies)) + test_mix(LinUCBCache(100, True, policies)) + test_hybrid(ThompsonSamplingCache(kSampleSize, True, [LRUPolicy()])) + test_hybrid(LinUCBCache(kSampleSize, True, [LRUPolicy()])) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc similarity index 99% rename from tools/block_cache_trace_analyzer.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer.cc index 761395a6654..032ed2be24f 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #ifdef GFLAGS -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" #include #include @@ -1395,13 +1395,12 @@ Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord( } int ret = snprintf( trace_record_buffer_, sizeof(trace_record_buffer_), - "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%" PRIu32 ",%" PRIu64 - "" - ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", + "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32 + ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", access.access_timestamp, block_id, access.block_type, access.block_size, - access.cf_id, access.level, access.sst_fd_number, access.caller, - access.no_insert, access.get_id, get_key_id, access.referenced_data_size, - access.is_cache_hit); + access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number, + access.caller, access.no_insert, access.get_id, get_key_id, + access.referenced_data_size, access.is_cache_hit); if (ret < 0) { return Status::IOError("failed to format the output"); } @@ -2134,6 +2133,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteAccessTimeline(label, kSecondInHour, false); } else { analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); } } } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h similarity index 100% rename from tools/block_cache_trace_analyzer.h rename to tools/block_cache_analyzer/block_cache_trace_analyzer.h diff --git a/tools/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py similarity index 100% rename from tools/block_cache_trace_analyzer_plot.py rename to tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py diff --git a/tools/block_cache_trace_analyzer_test.cc 
b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc similarity index 99% rename from tools/block_cache_trace_analyzer_test.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index a028bf197c9..9917d5b9e78 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -23,7 +23,7 @@ int main() { #include "rocksdb/trace_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -343,7 +343,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::string l; ASSERT_TRUE(getline(ss, l, ',')); if (l.find("block") == std::string::npos) { - if (unit != "_60" || user_access_only != "all_access_") { + if (user_access_only != "all_access_") { continue; } } diff --git a/tools/block_cache_trace_analyzer_tool.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc similarity index 91% rename from tools/block_cache_trace_analyzer_tool.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc index b7b36c5d241..63382cf8c22 100644 --- a/tools/block_cache_trace_analyzer_tool.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc @@ -11,7 +11,7 @@ int main() { return 1; } #else // GFLAGS -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" int main(int argc, char** argv) { return rocksdb::block_cache_trace_analyzer_tool(argc, argv); } From 3f89af1c39da4991ef6c544fc5e3f164a688b375 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 26 Jul 2019 15:48:35 -0700 Subject: [PATCH 263/572] Reduce the number of random iterations in compact_on_deletion_collector_test (#5635) Summary: This test frequently times out under TSAN; reducing the number of random iterations to make it complete faster. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5635 Test Plan: buck test mode/dev-tsan internal_repo_rocksdb/repo:compact_on_deletion_collector_test Differential Revision: D16523505 Pulled By: ltamasi fbshipit-source-id: 6a69909bce9d204c891150fcb3d536547b3253d0 --- .../compact_on_deletion_collector_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 101aa988b66..57eed107011 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -40,7 +40,7 @@ int main(int /*argc*/, char** /*argv*/) { // randomize tests rocksdb::Random rnd(301); const int kMaxTestSize = 100000l; - for (int random_test = 0; random_test < 30; random_test++) { + for (int random_test = 0; random_test < 10; random_test++) { int window_size = rnd.Uniform(kMaxTestSize) + 1; int deletion_trigger = rnd.Uniform(window_size); window_sizes.emplace_back(window_size); From 80d7067cb2e1d675104fac9e7d5e52b3aa56aa3b Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Fri, 26 Jul 2019 16:28:38 -0700 Subject: [PATCH 264/572] Use int64_t instead of ssize_t (#5638) Summary: The ssize_t type was introduced in https://github.com/facebook/rocksdb/pull/5633, but it seems like it's a POSIX specific type. 
I just need a signed type to represent a number of bytes, so use int64_t instead. It seems like we have a typedef from SSIZE_T for Windows, but it doesn't seem like we ever include "port/port.h" in our public header files.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5638

Differential Revision: D16526269

Pulled By: lth

fbshipit-source-id: 8d3a5c41003951b74b29bc5f1d949b2b22da0cee
---
 include/rocksdb/utilities/transaction_db.h    | 4 ++--
 utilities/transactions/write_unprepared_txn.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 33826bab861..7798e63da7b 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -104,7 +104,7 @@ struct TransactionDBOptions {
 // This option is only valid for write unprepared. If a write batch exceeds
 // this threshold, then the transaction will implicitly flush the currently
 // pending writes into the database. A value of 0 or less means no limit.
- ssize_t default_write_batch_flush_threshold = 0;
+ int64_t default_write_batch_flush_threshold = 0;

 private:
 // 128 entries
@@ -171,7 +171,7 @@ struct TransactionOptions {
 // See TransactionDBOptions::default_write_batch_flush_threshold for
 // description. If a negative value is specified, then the default value from
 // TransactionDBOptions is used.
- ssize_t write_batch_flush_threshold = -1;
+ int64_t write_batch_flush_threshold = -1;
 };

 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index feac749ee82..bc952544ab0 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -167,7 +167,7 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
 // write_batch_flush_threshold_ has been exceeded, and then call
 // FlushWriteBatchToDB if so. This logic is encapsulated in
 // MaybeFlushWriteBatchToDB.
- ssize_t write_batch_flush_threshold_;
+ int64_t write_batch_flush_threshold_;

 WriteUnpreparedTxnDB* wupt_db_;

 // Ordered list of unprep_seq sequence numbers that we have already written

From e648c1d9eb093e6cbb2ed500b17915a43c5aa172 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Mon, 29 Jul 2019 10:52:32 -0700
Subject: [PATCH 265/572] Cache simulator: Optimize hybrid row-block cache. (#5616)

Summary:
This PR optimizes the hybrid row-block cache simulator. If a Get request hits the cache, we treat all its future accesses as hits.

Consider a Get request (no snapshot) that accesses multiple files, e.g., file1, file2, file3. We construct the row key as "fdnumber_key_0". Before this PR, if it hits the cache when searching the key in file1, we continue to process its accesses in file2 and file3, which is unnecessary.

With this PR, if "file1_key_0" is in the cache, we treat all future accesses of this Get request as hits.
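To make the new bookkeeping concrete, here is a minimal, self-contained C++ sketch of the fast path this change adds. `GetRequestStatus` mirrors the struct introduced in the diff below; `AccessForGet` and its parameters are illustrative stand-ins for the simulator's real plumbing, not actual RocksDB API:

```cpp
#include <cstdint>
#include <map>
#include <string>

enum class InsertResult { INSERTED, ADMITTED, NO_INSERT };

// Mirrors GetRequestStatus from the diff: one record per get id.
struct GetRequestStatus {
  // Set once the referenced row key has been served from the cache.
  bool is_complete = false;
  // Insertion state of each row key referenced by this get request.
  std::map<std::string, InsertResult> row_key_status;
};

// Illustrative fast path: once a get completes, later accesses issued by the
// same get are counted as hits without simulating any block lookups.
bool AccessForGet(std::map<uint64_t, GetRequestStatus>& getid_status_map,
                  uint64_t get_id, const std::string& row_key,
                  bool row_key_in_cache) {
  GetRequestStatus& status = getid_status_map[get_id];
  if (status.is_complete) {
    return true;  // Hit: skip the index/filter/data block accesses entirely.
  }
  if (row_key_in_cache) {
    status.is_complete = true;
    return true;
  }
  // Miss: remember the row key; the caller goes on to access the get's
  // index/filter/data blocks and may later insert the row key-value pair.
  status.row_key_status.emplace(row_key, InsertResult::ADMITTED);
  return false;
}
```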
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5616 Differential Revision: D16453187 Pulled By: HaoyuHuang fbshipit-source-id: 56f3169cc322322305baaf5543226a0824fae19f --- utilities/simulator_cache/cache_simulator.cc | 45 +++--- utilities/simulator_cache/cache_simulator.h | 19 ++- .../simulator_cache/cache_simulator_test.cc | 149 +++++++++++++++++- 3 files changed, 186 insertions(+), 27 deletions(-) diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 06de4c11996..98a5c8a695f 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -122,14 +122,26 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { // TODO (haoyu): We only support Get for now. We need to extend the tracing // for MultiGet, i.e., non-data block accesses must log all keys in a // MultiGet. - bool is_cache_miss = false; + bool is_cache_miss = true; bool admitted = false; if (access.caller == TableReaderCaller::kUserGet && access.get_id != BlockCacheTraceHelper::kReservedGetId) { - // This is a Get/MultiGet request. + // This is a Get request. const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); - if (getid_getkeys_map_[access.get_id].find(row_key) == - getid_getkeys_map_[access.get_id].end()) { + GetRequestStatus& status = getid_status_map_[access.get_id]; + if (status.is_complete) { + // This Get request completes. + // Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + if (status.row_key_status.find(row_key) == status.row_key_status.end()) { // This is the first time that this key is accessed. Look up the key-value // pair first. Do not update the miss/accesses metrics here since it will // be updated later. @@ -144,37 +156,30 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { } else if (admitted) { result = InsertResult::ADMITTED; } - getid_getkeys_map_[access.get_id][row_key] = - std::make_pair(is_cache_miss, result); + status.row_key_status[row_key] = result; } - std::pair miss_inserted = - getid_getkeys_map_[access.get_id][row_key]; - if (!miss_inserted.first) { - // This is a cache hit. Skip future accesses to its index/filter/data - // blocks. These block lookups are unnecessary if we observe a hit for the - // referenced key-value pair already. Thus, we treat these lookups as - // hits. This is also to ensure the total number of accesses are the same - // when comparing to other policies. + if (!is_cache_miss) { + // A cache hit. + status.is_complete = true; miss_ratio_stats_.UpdateMetrics(access.access_timestamp, /*is_user_access=*/true, /*is_cache_miss=*/false); return; } - // The key-value pair observes a cache miss. We need to access its + // The row key-value pair observes a cache miss. We need to access its // index/filter/data blocks. 
+ InsertResult inserted = status.row_key_status[row_key]; AccessKVPair( - access.block_key, access.block_type, ComputeBlockPriority(access), + access.block_key, access.block_size, ComputeBlockPriority(access), access, /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/true); - if (access.referenced_data_size > 0 && - miss_inserted.second == InsertResult::ADMITTED) { + if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) { sim_cache_->Insert(row_key, /*value=*/nullptr, access.referenced_data_size, /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); - getid_getkeys_map_[access.get_id][row_key] = - std::make_pair(true, InsertResult::INSERTED); + status.row_key_status[row_key] = InsertResult::INSERTED; } return; } diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 3863fcf88dd..6f2a7e84d2b 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -47,6 +47,7 @@ class MissRatioStats { return static_cast(num_misses_ * 100.0 / num_accesses_); } uint64_t total_accesses() const { return num_accesses_; } + uint64_t total_misses() const { return num_misses_; } const std::map& num_accesses_timeline() const { return num_accesses_timeline_; @@ -63,6 +64,7 @@ class MissRatioStats { return static_cast(user_misses_ * 100.0 / user_accesses_); } uint64_t user_accesses() const { return user_accesses_; } + uint64_t user_misses() const { return user_misses_; } void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, bool is_cache_miss); @@ -168,17 +170,24 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { NO_INSERT, }; - // A map stores get_id to a map of row keys. For each row key, it stores a - // boolean and an enum. The first bool is true when we observe a miss upon the - // first time we encounter the row key. The second arg is INSERTED when the + // We set is_complete to true when the referenced row-key of a get request + // hits the cache. If is_complete is true, we treat future accesses of this + // get request as hits. + // + // For each row key, it stores an enum. It is INSERTED when the // kv-pair has been inserted into the cache, ADMITTED if it should be inserted // but haven't been, NO_INSERT if it should not be inserted. // // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not // know its size. This may happen if the first access on the referenced key is // an index/filter block. - std::map>> - getid_getkeys_map_; + struct GetRequestStatus { + bool is_complete = false; + std::map row_key_status; + }; + + // A map stores get_id to a map of row keys. 
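+ // Keyed by get_id; each entry records whether the get request has
+ // completed and the insertion state of every row key it referenced.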
+ std::map getid_status_map_; bool insert_blocks_upon_row_kvpair_miss_; }; diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index dc3b8327e01..babdd431f5a 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -14,6 +14,7 @@ namespace rocksdb { namespace { const std::string kBlockKeyPrefix = "test-block-"; const std::string kRefKeyPrefix = "test-get-"; +const std::string kRefKeySequenceNumber = std::string(8, 'c'); const uint64_t kGetId = 1; const uint64_t kGetBlockId = 100; const uint64_t kCompactionBlockId = 1000; @@ -38,12 +39,12 @@ class CacheSimulatorTest : public testing::Test { record.cf_name = "test"; record.caller = TableReaderCaller::kUserGet; record.level = 6; - record.sst_fd_number = kGetBlockId; + record.sst_fd_number = 0; record.get_id = getid; record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; record.referenced_key = - kRefKeyPrefix + std::to_string(kGetId) + std::string(8, 'c'); + kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber; record.referenced_key_exist_in_block = Boolean::kTrue; record.referenced_data_size = 100; record.num_keys_in_block = 300; @@ -66,6 +67,29 @@ class CacheSimulatorTest : public testing::Test { return record; } + void AssertCache(std::shared_ptr sim_cache, + const MissRatioStats& miss_ratio_stats, + uint64_t expected_usage, uint64_t expected_num_accesses, + uint64_t expected_num_misses, + std::vector blocks, + std::vector keys) { + EXPECT_EQ(expected_usage, sim_cache->GetUsage()); + EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses()); + EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses()); + for (auto const& block : blocks) { + auto handle = sim_cache->Lookup(block); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + for (auto const& key : keys) { + std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber; + auto handle = + sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString() + "_0"); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + } + Env* env_; }; @@ -277,6 +301,127 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { } } +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { + BlockCacheTraceRecord get = GenerateGetRecord(kGetId); + get.block_size = 1; + get.referenced_data_size = 0; + get.access_timestamp = 0; + get.block_key = "1"; + get.get_id = 1; + get.get_from_user_specified_snapshot = Boolean::kFalse; + get.referenced_key = + kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber; + get.no_insert = Boolean::kFalse; + get.sst_fd_number = 0; + get.get_from_user_specified_snapshot = Boolean::kFalse; + + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/16, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true)); + // Expect a miss and does not insert the row key-value pair since it does not + // have size. 
+ cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"}, + {}); + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.block_key = "2"; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2, + {"1", "2"}, {"1"}); + get.access_timestamp += 1; + get.block_key = "3"; + // K1 should not inserted again. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3, + {"1", "2", "3"}, {"1"}); + + // A second get request referencing the same key. + get.access_timestamp += 1; + get.get_id = 2; + get.block_key = "4"; + get.referenced_data_size = 0; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3, + {"1", "2", "3"}, {"1"}); + + // A third get request searches three files, three different keys. + // And the second key observes a hit. + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "3"; + get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber; + // K2 should observe a miss. Block 3 observes a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber; + // K1 should observe a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber; + // K3 should observe a miss. + // However, as the get already complete, we should not access k3 any more. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3, + {"1", "2", "3"}, {"1", "2"}); + + // A fourth get request searches one file and two blocks. One row key. + get.access_timestamp += 1; + get.get_id = 4; + get.block_key = "5"; + get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4, + {"1", "2", "3", "5"}, {"1", "2", "4"}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + } + + // A bunch of insertions which evict cached row keys. + for (uint32_t i = 6; i < 100; i++) { + get.access_timestamp += 1; + get.get_id = 0; + get.block_key = std::to_string(i); + cache_simulator->Access(get); + } + + get.get_id = 4; + // A different block. + get.block_key = "100"; + // Same row key and should not be inserted again. 
+ get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {}, + {}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + ASSERT_EQ(nullptr, handle); + } +} + TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { uint64_t block_id = 100; BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); From 399f477818578c0d3e4614f6f148e8d7859121a2 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Mon, 29 Jul 2019 17:51:30 -0700 Subject: [PATCH 266/572] WriteUnPrepared: Use WriteUnpreparedTxnReadCallback for MultiGet (#5634) Summary: The `TransactionTest.MultiGetBatchedTest` were failing with unprepared batches because we were not using the correct callbacks. Override MultiGet to pass down the correct ReadCallback. A similar problem is also fixed in WritePrepared. This PR also fixes an issue similar to (https://github.com/facebook/rocksdb/pull/5147), but for MultiGet instead of Get. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5634 Differential Revision: D16552674 Pulled By: lth fbshipit-source-id: 736eaf8e919c6b13d5f5655b1c0d36b57ad04804 --- db/db_impl/db_impl.cc | 18 +++++++++++++++++ utilities/transactions/write_prepared_txn.cc | 19 ++++++++++++++++++ utilities/transactions/write_prepared_txn.h | 7 +++++++ .../transactions/write_unprepared_txn.cc | 20 +++++++++++++++++++ utilities/transactions/write_unprepared_txn.h | 7 +++++++ 5 files changed, 71 insertions(+) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 16a6d86a658..29b7f6f1470 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1857,6 +1857,24 @@ void DBImpl::MultiGetImpl( snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); + if (callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. 
+ // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = callback->max_visible_seq(); + } } // For each of the given keys, apply the entire "get" process as follows: diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index f4c21d4769e..97bebac5d57 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -40,6 +40,25 @@ void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) { prepare_batch_cnt_ = 0; } +void WritePreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const bool backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h index 2cd729cd2c7..c574f62310f 100644 --- a/utilities/transactions/write_prepared_txn.h +++ b/utilities/transactions/write_prepared_txn.h @@ -53,6 +53,13 @@ class WritePreparedTxn : public PessimisticTransaction { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input = false) override; + // Note: The behavior is undefined in presence of interleaved writes to the // same transaction. 
// To make WAL commit markers visible, the snapshot will be diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c677013aa03..d8c5eea5561 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -524,6 +524,26 @@ void WriteUnpreparedTxn::Clear() { TransactionBaseImpl::Clear(); } +void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const bool backed_by_snapshot = + wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + unprep_seqs_); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index bc952544ab0..2c23155946a 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -146,6 +146,13 @@ class WriteUnpreparedTxn : public WritePreparedTxn { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input = false) override; + using Transaction::GetIterator; virtual Iterator* GetIterator(const ReadOptions& options) override; virtual Iterator* GetIterator(const ReadOptions& options, From 849a8c0ae0a0d72e0872f8c497626e1ff6dd8af9 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 30 Jul 2019 14:09:02 -0700 Subject: [PATCH 267/572] fix sign compare warnings (#5651) Summary: Fix -Wsign-compare warnings for gcc9. 
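The warnings all share one pattern: an unsigned value compared against the maximum of a signed type. A hedged sketch of the fix applied throughout (using `int64_t` to stand in for platform types such as `off_t`; `FitsInSignedRange` is a made-up name for illustration):

```cpp
#include <cstdint>
#include <limits>

// gcc9 emits -Wsign-compare for `offset <= std::numeric_limits<int64_t>::max()`
// because the left side is unsigned and the right side is signed. Casting the
// signed bound to the unsigned type silences the warning without changing the
// result, since the maximum of a signed type always fits in the unsigned one.
bool FitsInSignedRange(uint64_t offset) {
  return offset <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
}
```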
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5651 Test Plan: Tested with ubuntu19.10+gcc9 Differential Revision: D16567428 fbshipit-source-id: 730b2704d42ba0c4e4ea946a3199bbb34be4c25c --- env/io_posix.cc | 14 +++++++------- port/port_posix.cc | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/env/io_posix.cc b/env/io_posix.cc index 293516feee8..bcc9ab5272e 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -803,8 +803,8 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { @@ -873,7 +873,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } - assert(offset <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); const char* src = data.data(); size_t nbytes = data.size(); if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { @@ -1009,8 +1009,8 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; @@ -1031,8 +1031,8 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { #ifdef ROCKSDB_RANGESYNC_PRESENT - assert(offset <= std::numeric_limits::max()); - assert(nbytes <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(nbytes <= static_cast(std::numeric_limits::max())); if (sync_file_range_supported_) { int ret; if (strict_bytes_per_sync_) { diff --git a/port/port_posix.cc b/port/port_posix.cc index f19d18ff0e6..167159d83c8 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -192,7 +192,8 @@ int GetMaxOpenFiles() { return -1; } // protect against overflow - if (no_files_limit.rlim_cur >= std::numeric_limits::max()) { + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { return std::numeric_limits::max(); } return static_cast(no_files_limit.rlim_cur); From 55f4f5486d4fc0657100d34a0ca0d4fa81a18350 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 30 Jul 2019 15:56:41 -0700 Subject: [PATCH 268/572] Update buckifier templates (#5647) Summary: Update buckifier templates in the scripts. Test plan (on devserver) ``` $python buckifier/buckify_rocksdb.py ``` Then ``` $git diff ``` Verify that generated TARGETS file is the same (except for indentation). 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5647 Differential Revision: D16555647 Pulled By: riversand963 fbshipit-source-id: 32574a4d0e820858eab2391304dd731141719bcd --- buckifier/targets_cfg.py | 82 ++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 79648bb6a6d..0ebd6d9427e 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -11,29 +11,9 @@ ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] @@ -46,11 +26,54 @@ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -58,7 +81,6 @@ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -75,9 +97,15 @@ # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) - -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) + +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) """ @@ -88,6 +116,8 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -127,6 +157,8 @@ rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, From 265db3ebb525460c78d9ee8dfb573905beb972eb Mon Sep 17 00:00:00 2001 From: Fosco Marotto Date: Tue, 30 Jul 2019 16:05:19 -0700 Subject: [PATCH 269/572] Update history and version for 6.4.0 (#5652) Summary: Master branch had been left at 6.2 and history of 6.3 and beyond were merged. Updated this to correct. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5652 Differential Revision: D16570498 Pulled By: gfosco fbshipit-source-id: 79f62ec570539a3e3d7d7c84a6cf7b722395fafe --- HISTORY.md | 51 ++++++++++++++++++++++++++------------- include/rocksdb/version.h | 2 +- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ace55cab404..ba96b0e4ba5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,46 +1,64 @@ # Rocksdb Change Log ## Unreleased + +## 6.4.0 (7/30/2019) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explictly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. ### Public API Change -* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. -* Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to the above refactoring, block cache eviction statistics for indexes, filters, and compression dictionaries are temporarily broken. We plan to reintroduce them in a later phase. * Errors related to the retrieval of the compression dictionary are now propagated to the user. -* options.keep_log_file_num will be enforced strictly all the time. 
File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. -* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. -* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. -* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. -* Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. -* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path * Overload GetAllKeyVersions() to support non-default column family. * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. * Replaces old Registra with ObjectRegistry to allow user to create custom object from string, also add LoadEnv() to Env. * Added new overload of GetApproximateSizes which gets SizeApproximationOptions object and returns a Status. The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors. +### New Features +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. +* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. + +### Performance Improvements +* Reduce iterator key comparison for upper/lower bound check. +* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. + +### Bug Fixes +* Fix ingested file and directory not being fsynced. +* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. + + +## 6.3.1 (7/24/2019) +### Bug Fixes +* Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. + +## 6.3.0 (6/18/2019) +### Public API Change +* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
+* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. +* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Add C bindings for secondary instance, i.e. DBImplSecondary. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path + ### New Features * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. -* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. -* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. -* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases -* Reduce iterator key comparision for upper/lower bound check. * Log Writer will flush after finishing the whole record, rather than a fragment. * Lower MultiGet batching API latency by reading data blocks from disk in parallel -* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. -* The compression dictionary is no longer copied to a new object upon retrieval. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. @@ -50,11 +68,10 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level.
* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. -* Fix ingested file and directory not being fsync. -* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. * On DB open, delete WAL trash files left behind in wal_dir + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 7b7d7e86224..d86c5fc886c 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 2 +#define ROCKSDB_MINOR 4 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From b538e756c29eac69e5362d9dff52833200d3e242 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 30 Jul 2019 17:41:15 -0700 Subject: [PATCH 270/572] Split the recent block based table changes between 6.3 and 6.4 in HISTORY.md Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5653 Differential Revision: D16573445 Pulled By: ltamasi fbshipit-source-id: 19c639044fcfd43b5d5c627c8def33ff2dbb2af8 --- HISTORY.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ba96b0e4ba5..9e057250aee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,7 +7,9 @@ * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. ### Public API Change -* Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. +* Filter and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, filter and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. +* Due to the above refactoring, block cache eviction statistics for filter and compression dictionary blocks are temporarily broken. We plan to reintroduce them in a later phase. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Errors related to the retrieval of the compression dictionary are now propagated to the user. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. * Overload GetAllKeyVersions() to support non-default column family. @@ -38,12 +40,12 @@ ## 6.3.0 (6/18/2019) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. 
+* Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. -* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path From 4834dab578114b429163746acbcb93073bb5784f Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Wed, 31 Jul 2019 08:46:48 -0700 Subject: [PATCH 271/572] Improve CPU Efficiency of ApproximateSize (part 2) (#5609) Summary: In some cases, we don't have to get a really accurate number. Something like 10% off is fine, so we can create a new option for that use case. In this case, we can calculate the size of the full files first, and avoid estimation inside SST files if the full files already got us a huge number. For example, if we have already covered 100GB of data, we should be able to skip partial dives into 10 SST files of 30MB each. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609 Differential Revision: D16433481 Pulled By: elipoz fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b --- HISTORY.md | 1 + db/compaction/compaction_job.cc | 3 +- db/db_impl/db_impl.cc | 4 +- db/db_test.cc | 96 ++++++++++++++++--- db/version_set.cc | 161 +++++++++++++++++++++----------- db/version_set.h | 8 +- include/rocksdb/options.h | 10 ++ 7 files changed, 208 insertions(+), 75 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9e057250aee..201cef2b1b3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. * Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. +* Added new option in SizeApproximationOptions used with DB::GetApproximateSizes.
When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin. ### Performance Improvements * Reduce iterator key comparision for upper/lower bound check. diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index db701d19dad..663c8aa0a80 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, + uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a, + b, start_lvl, out_lvl + 1, TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 29b7f6f1470..81c44388bcf 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( - v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, - TableReaderCaller::kUserApproximateSize); + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } if (options.include_memtabtles) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/db_test.cc b/db/db_test.cc index f247ddb80fa..f53afa17d9d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { options.compression = kNoCompression; options.create_if_missing = true; DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); const int N = 128; Random rnd(301); @@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) { std::string start = Key(50); std::string end = Key(60); Range r(start, end); - uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES | - DB::SizeApproximationFlags::INCLUDE_MEMTABLES; - db_->GetApproximateSizes(&r, 1, &size, include_both); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table @@ -1280,7 +1282,7 @@ start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1290,19 +1292,20 @@ start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000);
options.max_write_buffer_number = 8; options.min_write_buffer_number_to_merge = 5; options.write_buffer_size = 1024 * N; // Not very large DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); int keys[N * 3]; for (int i = 0; i < N; i++) { @@ -1319,26 +1322,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); ASSERT_GT(size_with_mt, 6000); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_EQ(size_without_mt, 0); @@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtabtles flag works as expected + size_approx_options.include_memtabtles = false; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + Options options = CurrentOptions(); + options.write_buffer_size = 1024 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 1024 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files + Flush(); + // Compact the entire key space into the next level + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files again + Flush(); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = 
false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); } TEST_F(DBTest, GetApproximateMemTableStats) { diff --git a/db/version_set.cc b/db/version_set.cc index 7d477a6806b..3a1f47790c5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. -uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + // pre-condition - assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); + assert(icmp.Compare(start, end) <= 0); - uint64_t size = 0; + uint64_t total_full_size = 0; const auto* vstorage = v->storage_info(); - end_level = end_level == -1 - ? vstorage->num_non_empty_levels() - : std::min(end_level, vstorage->num_non_empty_levels()); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); assert(start_level <= end_level); - for (int level = start_level; level < end_level; level++) { + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
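+ //
+ // Illustrative arithmetic (hypothetical numbers, not from a real run): with
+ // files_size_error_margin = 0.1, suppose the files fully contained in the
+ // range sum to total_full_size = 100 GB, while the boundary (intersecting)
+ // files sum to total_intersecting_size = 1 GB. Since 1 GB < 100 GB * 0.1,
+ // the per-file binary searches are skipped and 1 GB / 2 = 0.5 GB is added
+ // to the estimate. The true contribution of the boundary files lies in
+ // [0 GB, 1 GB], so the result is off by at most 0.5 GB, i.e. ~0.5% of the
+ // total, well within the requested 10% margin.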
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); - if (!files_brief.num_files) { + if (files_brief.num_files == 0) { // empty level, skip exploration continue; } - if (!level) { - // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end, caller); + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } continue; } assert(level > 0); assert(files_brief.num_files > 0); - // identify the file position for starting key - const uint64_t idx_start = FindFileInRange( - v->cfd_->internal_comparator(), files_brief, start, - /*start=*/0, static_cast(files_brief.num_files - 1)); - assert(idx_start < files_brief.num_files); - - // scan all files from the starting position until the ending position - // inferred from the sorted order - for (uint64_t i = idx_start; i < files_brief.num_files; i++) { - uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end, caller); - if (!val) { - // the files after this will not have the range - break; - } + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); - size += val; + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); - if (i == idx_start) { - // subtract the bytes needed to be scanned to get to the starting - // key - val = ApproximateSize(v, files_brief.files[i], start, caller); - assert(size >= val); - size -= val; - } + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); } } - return size; -} + // The sum of all file sizes that intersect the [start, end] keys range. 
+ uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } -uint64_t VersionSet::ApproximateSizeLevel0(Version* v, - const LevelFilesBrief& files_brief, - const Slice& key_start, - const Slice& key_end, - TableReaderCaller caller) { - // level 0 files are not in sorted order, we need to iterate through - // the list to compute the total bytes that require scanning - uint64_t size = 0; - for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = - ApproximateSize(v, files_brief.files[i], key_start, caller); - const uint64_t end = - ApproximateSize(v, files_brief.files[i], key_end, caller); - assert(end >= start); - size += end - start; + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files, at each level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, end, caller); + // subtract the bytes needed to be scanned to get to the starting key + uint64_t val = ApproximateSize(v, *file_ptr, start, caller); + assert(total_full_size >= val); + total_full_size -= val; + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + total_full_size += ApproximateSize(v, *file_ptr, end, caller); + } } - return size; + + return total_full_size; } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, @@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, TableReaderCaller caller) { // pre-condition assert(v); + const auto& icmp = v->cfd_->internal_comparator(); uint64_t result = 0; - if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) { + if (icmp.Compare(f.largest_key, key) <= 0) { // Entire file is before "key", so just add the file size result = f.fd.GetFileSize(); - } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) { + } else if (icmp.Compare(f.smallest_key, key) > 0) { // Entire file is after "key", so ignore result = 0; } else { @@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, TableCache* table_cache = v->cfd_->table_cache(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(), + key, f.file_metadata->fd, caller, icmp, v->GetMutableCFOptions().prefix_extractor.get()); } } diff --git a/db/version_set.h b/db/version_set.h index ee94f5966df..391bb902c4b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -983,7 +983,8 @@ class VersionSet { // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). 
If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, + uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller); @@ -1033,11 +1034,6 @@ class VersionSet { } }; - // ApproximateSize helper - uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end, - TableReaderCaller caller); - uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 5ae010b8f52..bda44d4417c 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1514,6 +1514,16 @@ struct SizeApproximationOptions { // Defines whether the returned size should include data serialized to disk. // If set to false, include_memtabtles must be true. bool include_files = true; + // When approximating the total size of the files used to store a key range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows taking some + // shortcuts in file size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned file size + // approximation will be within 10%. + // If the value is non-positive, a more precise yet more CPU-intensive + // estimation is performed. + double files_size_error_margin = -1.0; }; } // namespace rocksdb From d599135a0332a8aa08abe56d08027f61331ef9e3 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 31 Jul 2019 10:41:05 -0700 Subject: [PATCH 272/572] WriteUnPrepared: use WriteUnpreparedTxnReadCallback for ValidateSnapshot (#5657) Summary: In DeferSnapshotSavePointTest, writes were failing with a snapshot validation error because the key with the latest sequence number was an unprepared key from the current transaction. Fix this by passing down the correct read callback. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5657 Differential Revision: D16582466 Pulled By: lth fbshipit-source-id: 11645dac0e7c1374d917ef5fdf757d13c1d1108d --- .../transactions/write_unprepared_txn.cc | 33 +++++++++++++++++++ utilities/transactions/write_unprepared_txn.h | 4 +++ 2 files changed, 37 insertions(+) diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index d8c5eea5561..c5f4db5bd56 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -574,6 +574,39 @@ Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options, return write_batch_.NewIteratorWithBase(column_family, db_iter); } +Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic.
+ assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check( + snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // tracked, so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previously validated at a sequence number earlier + // than the current snapshot's sequence number, we already know it has not + // been modified. + return Status::OK(); + } + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ? column_family : db_impl_->DefaultColumnFamily(); + + WriteUnpreparedTxnReadCallback snap_checker(wupt_db_, snap_seq, + min_uncommitted, unprep_seqs_); + return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(), + snap_seq, false /* cache_only */, + &snap_checker, min_uncommitted); +} + const std::map& WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() { return unprep_seqs_; diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 2c23155946a..77c18033898 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -158,6 +158,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { virtual Iterator* GetIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) override; + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + private: friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; From f622ca2c7c12ff13b24083b57d1279aaa38a2ccd Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 31 Jul 2019 13:36:22 -0700 Subject: [PATCH 273/572] WriteUnPrepared: savepoint support (#5627) Summary: Add savepoint support when the current transaction has flushed unprepared batches. Rolling back to a savepoint is similar to rolling back a transaction. It requires finding the set of keys that have changed since the savepoint, re-reading the keys at the snapshot at that savepoint, and then restoring the old keys by writing out another unprepared batch. For this strategy to work though, we must be capable of reading keys at a savepoint. This does not work if keys were written out using the same sequence number before and after a savepoint. Therefore, when we flush out unprepared batches, we must split the batch by savepoint if any savepoints exist. eg. If we have the following:
```
Put(A)
Put(B)
Put(C)
SetSavePoint()
Put(D)
Put(E)
SetSavePoint()
Put(F)
```
Then we will write out 3 separate unprepared batches:
```
Put(A) 1
Put(B) 1
Put(C) 1
Put(D) 2
Put(E) 2
Put(F) 3
```
This is so that when we roll back to eg. the first savepoint, we can just read keys at snapshot_seq = 1.
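For illustration only (not part of this patch), a minimal sketch of the client-visible savepoint API this change supports, assuming a TransactionDB opened with the write-unprepared policy (TransactionDBOptions::write_policy = WRITE_UNPREPARED); error handling is elided:
```
#include <cassert>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Hedged sketch: savepoints on a transaction whose unprepared batches may
// have been flushed to the DB before commit.
void SavePointSketch(rocksdb::TransactionDB* txn_db) {
  rocksdb::WriteOptions write_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);

  txn->Put("A", "a");   // lands in the first unprepared batch
  txn->SetSavePoint();  // later flushes must split the batch here
  txn->Put("B", "b");   // lands in a batch with a newer sequence number

  // Reads back the pre-savepoint state at the savepoint's snapshot and
  // writes the old values out as another unprepared batch.
  rocksdb::Status s = txn->RollbackToSavePoint();
  assert(s.ok());

  s = txn->Commit();  // "A" commits; the write to "B" was rolled back
  assert(s.ok());
  delete txn;
}
```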
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5627 Differential Revision: D16584130 Pulled By: lth fbshipit-source-id: 6d100dd548fb20c4b76661bd0f8a2647e64477fa --- db/write_batch.cc | 48 ++-- db/write_batch_internal.h | 4 + .../utilities/write_batch_with_index.h | 2 + include/rocksdb/write_batch.h | 2 +- utilities/transactions/transaction_base.cc | 4 +- utilities/transactions/transaction_base.h | 20 +- .../transactions/write_unprepared_txn.cc | 258 +++++++++++++++++- utilities/transactions/write_unprepared_txn.h | 60 +++- .../transactions/write_unprepared_txn_db.cc | 4 +- .../write_batch_with_index.cc | 5 + 10 files changed, 378 insertions(+), 29 deletions(-) diff --git a/db/write_batch.cc b/db/write_batch.cc index 2c2d81e87f6..8a896644fc2 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -511,12 +511,25 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, } Status WriteBatch::Iterate(Handler* handler) const { - Slice input(rep_); - if (input.size() < WriteBatchInternal::kHeader) { + if (rep_.size() < WriteBatchInternal::kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(WriteBatchInternal::kHeader); + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + Slice key, value, blob, xid; // Sometimes a sub-batch starts with a Noop. 
We want to exclude such Noops as // the batch boundary symbols otherwise we would mis-count the number of @@ -547,7 +560,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } } else { assert(s.IsTryAgain()); - assert(!last_was_try_again); // to detect infinite loop bugs + assert(!last_was_try_again); // to detect infinite loop bugs if (UNLIKELY(last_was_try_again)) { return Status::Corruption( "two consecutive TryAgain in WriteBatch handler; this is either a " @@ -560,7 +573,7 @@ Status WriteBatch::Iterate(Handler* handler) const { switch (tag) { case kTypeColumnFamilyValue: case kTypeValue: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -570,7 +583,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyDeletion: case kTypeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -580,7 +593,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -590,7 +603,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyRangeDeletion: case kTypeRangeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); s = handler->DeleteRangeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -600,7 +613,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyMerge: case kTypeMerge: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -610,7 +623,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyBlobIndex: case kTypeBlobIndex: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); s = handler->PutBlobIndexCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -623,7 +636,7 @@ Status WriteBatch::Iterate(Handler* handler) const { empty_batch = false; break; case kTypeBeginPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -642,7 +655,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginPersistedPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -655,7 +668,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } 
break; case kTypeBeginUnprepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); handler->MarkBeginPrepare(true /* unprepared */); empty_batch = false; @@ -674,19 +687,19 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeEndPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); handler->MarkEndPrepare(xid); empty_batch = true; break; case kTypeCommitXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); handler->MarkCommit(xid); empty_batch = true; break; case kTypeRollbackXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); handler->MarkRollback(xid); empty_batch = true; @@ -702,7 +715,8 @@ Status WriteBatch::Iterate(Handler* handler) const { if (!s.ok()) { return s; } - if (handler_continue && found != WriteBatchInternal::Count(this)) { + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index bae62bf0317..67136a84716 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -192,6 +192,10 @@ class WriteBatchInternal { // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + // Iterate over [begin, end) range of a write batch + static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, + size_t begin, size_t end); + // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
static void SetAsLastestPersistentState(WriteBatch* b); diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 34e6c46895c..586088d7519 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -100,6 +100,8 @@ class WriteBatchWithIndex : public WriteBatchBase { size_t max_bytes = 0); ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); using WriteBatchBase::Put; Status Put(ColumnFamilyHandle* column_family, const Slice& key, diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 393c5d9c6ab..b6b7c8bb820 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -271,7 +271,7 @@ class WriteBatch : public WriteBatchBase { virtual bool Continue(); protected: - friend class WriteBatch; + friend class WriteBatchInternal; virtual bool WriteAfterCommit() const { return true; } virtual bool WriteBeforePrepare() const { return false; } }; diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index bf59a1c4069..30861f09148 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -30,7 +30,7 @@ TransactionBaseImpl::TransactionBaseImpl(DB* db, assert(dynamic_cast(db_) != nullptr); log_number_ = 0; if (dbimpl_->allow_2pc()) { - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + InitWriteBatch(); } } @@ -49,7 +49,7 @@ void TransactionBaseImpl::Clear() { num_merges_ = 0; if (dbimpl_->allow_2pc()) { - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + InitWriteBatch(); } } diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 657e9c59656..72fa9d26af4 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -11,6 +11,7 @@ #include #include +#include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/snapshot.h" @@ -273,6 +274,15 @@ class TransactionBaseImpl : public Transaction { // Sets a snapshot if SetSnapshotOnNextOperation() has been called. void SetSnapshotIfNeeded(); + // Initialize write_batch_ for 2PC by inserting Noop. + inline void InitWriteBatch(bool clear = false) { + if (clear) { + write_batch_.Clear(); + } + assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader); + WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + } + DB* db_; DBImpl* dbimpl_; @@ -325,16 +335,18 @@ class TransactionBaseImpl : public Transaction { // Optimistic Transactions will wait till commit time to do conflict checking. TransactionKeyMap tracked_keys_; + // Stack of the Snapshot saved at each save point. Saved snapshots may be + // nullptr if there was no snapshot at the time SetSavePoint() was called. + std::unique_ptr>> + save_points_; + private: friend class WritePreparedTxn; // Extra data to be persisted with the commit. Note this is only used when // prepare phase is not skipped. WriteBatch commit_time_batch_; - // Stack of the Snapshot saved at each save point. Saved snapshots may be - // nullptr if there was no snapshot at the time SetSavePoint() was called. - std::unique_ptr>> save_points_; - // If true, future Put/Merge/Deletes will be indexed in the // WriteBatchWithIndex. 
// If false, future Put/Merge/Deletes will be inserted directly into the diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c5f4db5bd56..993c3b8b60c 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -78,6 +78,8 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); recovered_txn_ = false; largest_validated_seq_ = 0; } @@ -236,6 +238,20 @@ Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { } Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { + // If the current write batch contains savepoints, then some special handling + // is required so that RollbackToSavepoint can work. + // + // RollbackToSavepoint is not supported after Prepare() is called, so only do + // this for unprepared batches. + if (!prepared && unflushed_save_points_ != nullptr && + !unflushed_save_points_->empty()) { + return FlushWriteBatchWithSavePointToDB(); + } + + return FlushWriteBatchToDBInternal(prepared); +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) { if (name_.empty()) { return Status::InvalidArgument("Cannot write to DB without SetName."); } @@ -285,13 +301,118 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { // Reset transaction state. if (!prepared) { prepare_batch_cnt_ = 0; - write_batch_.Clear(); - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); } return s; } +Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() { + assert(unflushed_save_points_ != nullptr && + unflushed_save_points_->size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + assert(save_points_->size() >= unflushed_save_points_->size()); + + // Handler class for creating an unprepared batch from a savepoint. + struct SavePointBatchHandler : public WriteBatch::Handler { + WriteBatchWithIndex* wb_; + const std::map& handles_; + + SavePointBatchHandler( + WriteBatchWithIndex* wb, + const std::map& handles) + : wb_(wb), handles_(handles) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Put(handles_.at(cf), key, value); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return wb_->Delete(handles_.at(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return wb_->SingleDelete(handles_.at(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Merge(handles_.at(cf), key, value); + } + + // The only expected 2PC marker is the initial Noop marker. + Status MarkNoop(bool empty_batch) override { + return empty_batch ? Status::OK() : Status::InvalidArgument(); + } + + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + // The comparator of the default cf is passed in, similar to the + // initialization of TransactionBaseImpl::write_batch_. This comparator is + // only used if the write batch encounters an invalid cf id, and falls back to + // this comparator. 
+ WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0, + true, 0); + // Swap with write_batch_ so that wb contains the complete write batch. The + // actual write batch that will be flushed to DB will be built in + // write_batch_, and will be read by FlushWriteBatchToDBInternal. + std::swap(wb, write_batch_); + TransactionBaseImpl::InitWriteBatch(); + + size_t prev_boundary = WriteBatchInternal::kHeader; + const bool kPrepared = true; + for (size_t i = 0; i < unflushed_save_points_->size(); i++) { + SavePointBatchHandler sp_handler(&write_batch_, + *wupt_db_->GetCFHandleMap().get()); + size_t curr_boundary = (*unflushed_save_points_)[i]; + + // Construct the partial write batch up to the savepoint. + // + // Theoretically, a memcpy between the write batches should be sufficient + // since the rewriting into the batch should produce the exact same byte + // representation. Rebuilding the WriteBatchWithIndex index is still + // necessary, though, and would imply doing two passes over the batch. + Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler, + prev_boundary, curr_boundary); + if (!s.ok()) { + return s; + } + + // Flush the write batch. + s = FlushWriteBatchToDBInternal(!kPrepared); + if (!s.ok()) { + return s; + } + + if (flushed_save_points_ == nullptr) { + flushed_save_points_.reset( + new autovector()); + } + flushed_save_points_->emplace_back( + unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot())); + + prev_boundary = curr_boundary; + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + } + + unflushed_save_points_->clear(); + return Status::OK(); +} + Status WriteUnpreparedTxn::PrepareInternal() { const bool kPrepared = true; return FlushWriteBatchToDB(kPrepared); @@ -379,6 +500,8 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } // else do the 2nd write to publish seq @@ -410,6 +533,8 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } @@ -488,6 +613,8 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } // else do the 2nd write for commit uint64_t& prepare_seq = seq_used; @@ -514,6 +641,8 @@ Status WriteUnpreparedTxn::RollbackInternal() { } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } @@ -524,6 +653,131 @@ void WriteUnpreparedTxn::Clear() { TransactionBaseImpl::Clear(); } +void WriteUnpreparedTxn::SetSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + PessimisticTransaction::SetSavePoint(); + if (unflushed_save_points_ == nullptr) { + unflushed_save_points_.reset(new autovector()); + } + unflushed_save_points_->push_back(write_batch_.GetDataSize()); +} + +Status WriteUnpreparedTxn::RollbackToSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ?
save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::RollbackToSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + return RollbackToSavePointInternal(); + } + + return Status::NotFound(); +} + +Status WriteUnpreparedTxn::RollbackToSavePointInternal() { + Status s; + + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + + assert(flushed_save_points_->size() > 0); + WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back(); + + assert(top.unprep_seqs_.size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + const TransactionKeyMap& tracked_keys = save_points_->top().new_keys_; + + // TODO(lth): Reduce duplicate code with RollbackInternal logic. + ReadOptions roptions; + roptions.snapshot = top.snapshot_->snapshot(); + SequenceNumber min_uncommitted = + static_cast_with_check( + roptions.snapshot) + ->min_uncommitted_; + SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber(); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + top.unprep_seqs_); + const auto& cf_map = *wupt_db_->GetCFHandleMap(); + for (const auto& cfkey : tracked_keys) { + const auto cfid = cfkey.first; + const auto& keys = cfkey.second; + + for (const auto& pair : keys) { + const auto& key = pair.first; + const auto& cf_handle = cf_map.at(cfid); + PinnableSlice pinnable_val; + bool not_used; + s = db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, + &callback); + + if (s.ok()) { + s = write_batch_.Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + s = write_batch_.Delete(cf_handle, key); + assert(s.ok()); + } else { + return s; + } + } + } + + const bool kPrepared = true; + s = FlushWriteBatchToDBInternal(!kPrepared); + assert(s.ok()); + if (!s.ok()) { + return s; + } + + // PessimisticTransaction::RollbackToSavePoint will also call + // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and has + // no savepoints because this savepoint has already been flushed. Work around + // this by setting a fake savepoint. + write_batch_.SetSavePoint(); + s = PessimisticTransaction::RollbackToSavePoint(); + assert(s.ok()); + if (!s.ok()) { + return s; + } + + flushed_save_points_->pop_back(); + return s; +} + +Status WriteUnpreparedTxn::PopSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::PopSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + // PessimisticTransaction::PopSavePoint will also call PopSavePoint on + // write_batch_. However, write_batch_ is empty and has no savepoints + // because this savepoint has already been flushed. Work around this by + // setting a fake savepoint.
+    write_batch_.SetSavePoint();
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    flushed_save_points_->pop_back();
+    return s;
+  }
+
+  return Status::NotFound();
+}
+
 void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
                                   ColumnFamilyHandle* column_family,
                                   const size_t num_keys, const Slice* keys,
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index 77c18033898..774d90e8d37 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -73,7 +73,6 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback {
     wup_snapshot_ = seq;
   }

- private:
   static SequenceNumber CalcMaxVisibleSeq(
       const std::map<SequenceNumber, size_t>& unprep_seqs,
       SequenceNumber snapshot_seq) {
@@ -84,6 +83,8 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback {
     }
     return std::max(max_unprepared, snapshot_seq);
   }
+
+ private:
   WritePreparedTxnDB* db_;
   const std::map<SequenceNumber, size_t>& unprep_seqs_;
   SequenceNumber wup_snapshot_;
@@ -139,6 +140,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn {

   void Clear() override;

+  void SetSavePoint() override;
+  Status RollbackToSavePoint() override;
+  Status PopSavePoint() override;
+
   // Get and GetIterator need to be overridden so that a ReadCallback to
   // handle read-your-own-write is used.
   using Transaction::Get;
@@ -172,6 +177,9 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
   Status MaybeFlushWriteBatchToDB();
   Status FlushWriteBatchToDB(bool prepared);
+  Status FlushWriteBatchToDBInternal(bool prepared);
+  Status FlushWriteBatchWithSavePointToDB();
+  Status RollbackToSavePointInternal();
   Status HandleWrite(std::function<Status()> do_write);

   // For write unprepared, we check on every writebatch append to see if
@@ -210,6 +218,56 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
   // but in some cases, we should be able to restore the previously largest
   // value when calling RollbackToSavePoint.
   SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavePoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about
+  // savepoints set on unprepared batches that have already flushed. It holds
+  // the snapshot and unprep_seqs at that savepoint, so that the rollback
+  // process can determine which keys were visible at that point in time.
+  //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
 };

 }  // namespace rocksdb
diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc
index 875d5416763..4381619e782 100644
--- a/utilities/transactions/write_unprepared_txn_db.cc
+++ b/utilities/transactions/write_unprepared_txn_db.cc
@@ -279,8 +279,8 @@ Status WriteUnpreparedTxnDB::Initialize(
     }
   }

-  wupt->write_batch_.Clear();
-  WriteBatchInternal::InsertNoop(wupt->write_batch_.GetWriteBatch());
+  const bool kClear = true;
+  wupt->InitWriteBatch(kClear);

   real_trx->SetState(Transaction::PREPARED);
   if (!s.ok()) {
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index cf17abf22e9..3ffa2e0c62a 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -627,6 +627,11 @@ WriteBatchWithIndex::WriteBatchWithIndex(

 WriteBatchWithIndex::~WriteBatchWithIndex() {}

+WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default;
+
+WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) =
+    default;
+
 WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }

 size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; }

From 1dfc5eaab03f998ab13a6953b53e41cdfd2c8237 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Wed, 31 Jul 2019 15:16:01 -0700
Subject: [PATCH 274/572] Test the various configurations in parallel in MergeOperatorPinningTest (#5659)

Summary: MergeOperatorPinningTest.Randomized frequently times out under TSAN because it tests ~40 option configurations sequentially in a loop. The patch parallelizes the tests of the various configurations to make the test complete faster.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5659

Test Plan: Tested using buck test mode/dev-tsan ...
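The mechanism, in outline: each option configuration becomes its own gtest parameter instance rather than one iteration of the old do/while loop, so the test harness can schedule the instances concurrently. A minimal, self-contained sketch of that pattern (illustrative names only, not the patch itself):

```cpp
#include <tuple>

#include <gtest/gtest.h>

// One test instance per (block-cache setting, option config) pair; a parallel
// test runner can shard these instead of executing ~40 configs in one body.
class PerConfigTest
    : public ::testing::TestWithParam<std::tuple<bool, int>> {};

TEST_P(PerConfigTest, Randomized) {
  bool disable_block_cache;
  int option_config;
  std::tie(disable_block_cache, option_config) = GetParam();
  // ... set up Options for this configuration and run the workload ...
  SUCCEED();
}

INSTANTIATE_TEST_CASE_P(
    AllConfigs, PerConfigTest,
    ::testing::Combine(::testing::Bool(), ::testing::Range(0, 40)));
```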
Differential Revision: D16587518 Pulled By: ltamasi fbshipit-source-id: 65bd25c0ad9a23587fed5592e69c1a0097fa27f6 --- db/db_merge_operator_test.cc | 142 ++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 2b5e4a445ea..31bd2e491b1 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -275,68 +275,6 @@ TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { VerifyDBFromMap(true_data); } -TEST_P(MergeOperatorPinningTest, Randomized) { - do { - Options options = CurrentOptions(); - options.merge_operator = MergeOperators::CreateMaxOperator(); - BlockBasedTableOptions table_options; - table_options.no_block_cache = disable_block_cache_; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - Random rnd(301); - std::map true_data; - - const int kTotalMerges = 5000; - // Every key gets ~10 operands - const int kKeyRange = kTotalMerges / 10; - const int kOperandSize = 20; - const int kNumPutBefore = kKeyRange / 10; // 10% value - const int kNumPutAfter = kKeyRange / 10; // 10% overwrite - const int kNumDelete = kKeyRange / 10; // 10% delete - - // kNumPutBefore keys will have base values - for (int i = 0; i < kNumPutBefore; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Do kTotalMerges merges - for (int i = 0; i < kTotalMerges; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Merge(WriteOptions(), key, value)); - - if (true_data[key] < value) { - true_data[key] = value; - } - } - - // Overwrite random kNumPutAfter keys - for (int i = 0; i < kNumPutAfter; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Delete random kNumDelete keys - for (int i = 0; i < kNumDelete; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - ASSERT_OK(db_->Delete(WriteOptions(), key)); - - true_data.erase(key); - } - - VerifyDBFromMap(true_data); - - } while (ChangeOptions(kSkipMergePut)); -} - class MergeOperatorHook : public MergeOperator { public: explicit MergeOperatorHook(std::shared_ptr _merge_op) @@ -637,6 +575,86 @@ TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { db_->ReleaseSnapshot(snapshot2); } +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map 
true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); +} + } // namespace rocksdb int main(int argc, char** argv) { From d1c9ede1956a29472fbe7202cd3e8ee7aefa7c31 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 1 Aug 2019 15:45:19 -0700 Subject: [PATCH 275/572] Fix duplicated file names in PurgeObsoleteFiles (#5603) Summary: Currently in `DBImpl::PurgeObsoleteFiles`, the list of candidate files is create through a combination of calling LogFileName using `log_delete_files` and `full_scan_candidate_files`. In full_scan_candidate_files, the filenames look like this {file_name = "074715.log", file_path = "/txlogs/3306"}, but LogFileName produces filenames like this that prepends a slash: {file_name = "/074715.log", file_path = "/txlogs/3306"}, This confuses the dedup step here: https://github.com/facebook/rocksdb/blob/bb4178066dc4f18b9b7f1d371e641db027b3edbe/db/db_impl/db_impl_files.cc#L339-L345 Because duplicates still exist, DeleteFile is called on the same file twice, and hits an error on the second try. Error message: Failed to mark /txlogs/3302/764418.log as trash. The root cause is the use of `kDumbDbName` when generating file names, it creates file names like /074715.log. This PR removes the use of `kDumbDbName` and create paths without leading '/' when dbname can be ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5603 Test Plan: make check Differential Revision: D16413203 Pulled By: miasantreble fbshipit-source-id: 6ba8288382c55f7d5e3892d722fc94b57d2e4491 --- db/db_impl/db_impl_files.cc | 5 ++--- file/filename.cc | 21 +++++++++++++++++---- file/filename.h | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 7afe3955e5b..e3b2f576523 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -316,10 +316,9 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { candidate_files.size() + state.sst_delete_files.size() + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
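// (Illustration of the mismatch described in the summary above: with the
// dummy dbname below, LogFileName(kDumbDbName, 74715) produced "/074715.log",
// while the full scan had recorded "074715.log"; the later sort-and-dedup over
// candidate_files therefore kept both entries and DeleteFile ran twice.)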
- const char* kDumbDbName = ""; for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(kDumbDbName, file.metadata->fd.GetNumber()), + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); @@ -329,7 +328,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), + candidate_files.emplace_back(LogFileName(file_num), immutable_db_options_.wal_dir); } } diff --git a/file/filename.cc b/file/filename.cc index d4f7dd9ec7c..65ec3314995 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -57,13 +57,17 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { return write_idx; } -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { +static std::string MakeFileName(uint64_t number, const char* suffix) { char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", + snprintf(buf, sizeof(buf), "%06llu.%s", static_cast(number), suffix); - return name + buf; + return buf; +} + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + return name + "/" + MakeFileName(number, suffix); } std::string LogFileName(const std::string& name, uint64_t number) { @@ -71,6 +75,11 @@ std::string LogFileName(const std::string& name, uint64_t number) { return MakeFileName(name, number, "log"); } +std::string LogFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, "log"); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -95,6 +104,10 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, kRocksDbTFileExt.c_str()); } +std::string MakeTableFileName(uint64_t number) { + return MakeFileName(number, kRocksDbTFileExt.c_str()); +} + std::string Rocks2LevelTableFileName(const std::string& fullname) { assert(fullname.size() > kRocksDbTFileExt.size() + 1); if (fullname.size() <= kRocksDbTFileExt.size() + 1) { diff --git a/file/filename.h b/file/filename.h index db06f4664e2..91b905f07ab 100644 --- a/file/filename.h +++ b/file/filename.h @@ -47,6 +47,8 @@ enum FileType { // "dbname". extern std::string LogFileName(const std::string& dbname, uint64_t number); +extern std::string LogFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, @@ -63,6 +65,8 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +extern std::string MakeTableFileName(uint64_t number); + // Return the name of sstable with LevelDB suffix // created from RocksDB sstable suffixed name extern std::string Rocks2LevelTableFileName(const std::string& fullname); From 30edf1874c11762a6cacf4434112ce34d13100d3 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 2 Aug 2019 10:40:32 -0700 Subject: [PATCH 276/572] Change buckifier to support parameterized dependencies (#5648) Summary: Users may desire to specify extra dependencies via buck. 
This PR allows users to pass additional dependencies as a JSON object so that the buckifier script can generate TARGETS file with desired extra dependencies. Test plan (on dev server) ``` $python buckifier/buckify_rocksdb.py '{"fake": {"extra_deps": [":test_dep", "//fakes/module:mock1"], "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]}}' Generating TARGETS Extra dependencies: {'': {'extra_compiler_flags': [], 'extra_deps': []}, 'test_dep1': {'extra_compiler_flags': ['-O2', '-DROCKSDB_LITE'], 'extra_deps': [':fake', '//dep1/mock']}} Generated TARGETS Summary: - 5 libs - 0 binarys - 296 tests ``` Verify the TARGETS file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5648 Differential Revision: D16565043 Pulled By: riversand963 fbshipit-source-id: a6ef02274174fcf159692d7b846e828454d01e89 --- TARGETS | 302 ++++++++++++++++++++++++++++++++++- buckifier/buckify_rocksdb.py | 97 ++++++++--- buckifier/targets_builder.py | 11 +- buckifier/targets_cfg.py | 8 +- defs.bzl | 8 +- 5 files changed, 398 insertions(+), 28 deletions(-) diff --git a/TARGETS b/TARGETS index 884d69b14bc..25d7ff66759 100644 --- a/TARGETS +++ b/TARGETS @@ -396,747 +396,1043 @@ cpp_library( external_deps = ROCKSDB_EXTERNAL_DEPS, ) -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ [ "arena_test", "memory/arena_test.cc", "serial", + [], + [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", "serial", + [], + [], ], [ "autovector_test", "util/autovector_test.cc", "serial", + [], + [], ], [ "backupable_db_test", "utilities/backupable/backupable_db_test.cc", "parallel", + [], + [], ], [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", "serial", + [], + [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", "serial", + [], + [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", "serial", + [], + [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", "serial", + [], + [], ], [ "block_test", "table/block_based/block_test.cc", "serial", + [], + [], ], [ "bloom_test", "util/bloom_test.cc", "serial", + [], + [], ], [ "c_test", "db/c_test.c", "serial", + [], + [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", "serial", + [], + [], ], [ "cache_test", "cache/cache_test.cc", "serial", + [], + [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", "serial", + [], + [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", "serial", + [], + [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", "serial", + [], + [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", "serial", + [], + [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", "serial", + [], + [], ], [ "cleanable_test", "table/cleanable_test.cc", "serial", + [], + [], ], [ "coding_test", "util/coding_test.cc", "serial", + [], + [], ], [ "column_family_test", "db/column_family_test.cc", "serial", + [], + [], ], [ "compact_files_test", "db/compact_files_test.cc", "serial", + [], + [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", "serial", + [], + [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", "serial", + [], + [], ], [ "compaction_job_stats_test", 
"db/compaction/compaction_job_stats_test.cc", "serial", + [], + [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", "serial", + [], + [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", "serial", + [], + [], ], [ "comparator_db_test", "db/comparator_db_test.cc", "serial", + [], + [], ], [ "corruption_test", "db/corruption_test.cc", "serial", + [], + [], ], [ "crc32c_test", "util/crc32c_test.cc", "serial", + [], + [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", "serial", + [], + [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", "serial", + [], + [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", "serial", + [], + [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", "serial", + [], + [], ], [ "db_basic_test", "db/db_basic_test.cc", "serial", + [], + [], ], [ "db_blob_index_test", "db/db_blob_index_test.cc", "serial", + [], + [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", "serial", + [], + [], ], [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_filter_test", "db/db_compaction_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_test", "db/db_compaction_test.cc", "parallel", + [], + [], ], [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", "serial", + [], + [], ], [ "db_encryption_test", "db/db_encryption_test.cc", "serial", + [], + [], ], [ "db_flush_test", "db/db_flush_test.cc", "serial", + [], + [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", "serial", + [], + [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", "serial", + [], + [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", "serial", + [], + [], ], [ "db_iter_test", "db/db_iter_test.cc", "serial", + [], + [], ], [ "db_iterator_test", "db/db_iterator_test.cc", "serial", + [], + [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", "serial", + [], + [], ], [ "db_memtable_test", "db/db_memtable_test.cc", "serial", + [], + [], ], [ "db_merge_operator_test", "db/db_merge_operator_test.cc", "parallel", + [], + [], ], [ "db_options_test", "db/db_options_test.cc", "serial", + [], + [], ], [ "db_properties_test", "db/db_properties_test.cc", "serial", + [], + [], ], [ "db_range_del_test", "db/db_range_del_test.cc", "serial", + [], + [], ], [ "db_secondary_test", "db/db_impl/db_secondary_test.cc", "serial", + [], + [], ], [ "db_sst_test", "db/db_sst_test.cc", "parallel", + [], + [], ], [ "db_statistics_test", "db/db_statistics_test.cc", "serial", + [], + [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", "serial", + [], + [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", "serial", + [], + [], ], [ "db_test", "db/db_test.cc", "parallel", + [], + [], ], [ "db_test2", "db/db_test2.cc", "serial", + [], + [], ], [ "db_universal_compaction_test", "db/db_universal_compaction_test.cc", "parallel", + [], + [], ], [ "db_wal_test", "db/db_wal_test.cc", "parallel", + [], + [], ], [ "db_write_test", "db/db_write_test.cc", "serial", + [], + [], ], [ "dbformat_test", "db/dbformat_test.cc", "serial", + [], + [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", "serial", + [], + [], ], [ "deletefile_test", "db/deletefile_test.cc", "serial", + [], + [], ], [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", "serial", + [], + [], ], [ "env_basic_test", "env/env_basic_test.cc", "serial", + [], + [], ], [ "env_logger_test", 
"logging/env_logger_test.cc", "serial", + [], + [], ], [ "env_test", "env/env_test.cc", "serial", + [], + [], ], [ "env_timed_test", "utilities/env_timed_test.cc", "serial", + [], + [], ], [ "error_handler_test", "db/error_handler_test.cc", "serial", + [], + [], ], [ "event_logger_test", "logging/event_logger_test.cc", "serial", + [], + [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", "serial", + [], + [], ], [ "external_sst_file_test", "db/external_sst_file_test.cc", "parallel", + [], + [], ], [ "fault_injection_test", "db/fault_injection_test.cc", "parallel", + [], + [], ], [ "file_indexer_test", "db/file_indexer_test.cc", "serial", + [], + [], ], [ "file_reader_writer_test", "util/file_reader_writer_test.cc", "parallel", + [], + [], ], [ "filelock_test", "util/filelock_test.cc", "serial", + [], + [], ], [ "filename_test", "db/filename_test.cc", "serial", + [], + [], ], [ "flush_job_test", "db/flush_job_test.cc", "serial", + [], + [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", "serial", + [], + [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", "serial", + [], + [], ], [ "hash_test", "util/hash_test.cc", "serial", + [], + [], ], [ "heap_test", "util/heap_test.cc", "serial", + [], + [], ], [ "histogram_test", "monitoring/histogram_test.cc", "serial", + [], + [], ], [ "import_column_family_test", "db/import_column_family_test.cc", "parallel", + [], + [], ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", "parallel", + [], + [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", "serial", + [], + [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", "serial", + [], + [], ], [ "listener_test", "db/listener_test.cc", "serial", + [], + [], ], [ "log_test", "db/log_test.cc", "serial", + [], + [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", "serial", + [], + [], ], [ "manual_compaction_test", "db/manual_compaction_test.cc", "parallel", + [], + [], ], [ "memory_test", "utilities/memory/memory_test.cc", "serial", + [], + [], ], [ "memtable_list_test", "db/memtable_list_test.cc", "serial", + [], + [], ], [ "merge_helper_test", "db/merge_helper_test.cc", "serial", + [], + [], ], [ "merge_test", "db/merge_test.cc", "serial", + [], + [], ], [ "merger_test", "table/merger_test.cc", "serial", + [], + [], ], [ "mock_env_test", "env/mock_env_test.cc", "serial", + [], + [], ], [ "object_registry_test", "utilities/object_registry_test.cc", "serial", + [], + [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", "serial", + [], + [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", "serial", + [], + [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", "serial", + [], + [], ], [ "options_file_test", "db/options_file_test.cc", "serial", + [], + [], ], [ "options_settable_test", "options/options_settable_test.cc", "serial", + [], + [], ], [ "options_test", "options/options_test.cc", "serial", + [], + [], ], [ "options_util_test", "utilities/options/options_util_test.cc", "serial", + [], + [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", "serial", + [], + [], ], [ "perf_context_test", "db/perf_context_test.cc", "serial", + [], + [], ], [ "persistent_cache_test", "utilities/persistent_cache/persistent_cache_test.cc", "parallel", + [], + [], ], [ "plain_table_db_test", "db/plain_table_db_test.cc", "serial", + [], + [], ], [ "prefix_test", 
"db/prefix_test.cc", "serial", + [], + [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", "serial", + [], + [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", "serial", + [], + [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", "serial", + [], + [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", "serial", + [], + [], ], [ "repair_test", "db/repair_test.cc", "serial", + [], + [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", "serial", + [], + [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", + [], + [], ], [ "skiplist_test", "memtable/skiplist_test.cc", "serial", + [], + [], ], [ "slice_transform_test", "util/slice_transform_test.cc", "serial", + [], + [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", "serial", + [], + [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", "serial", + [], + [], ], [ "statistics_test", "monitoring/statistics_test.cc", "serial", + [], + [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", "serial", + [], + [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", "serial", + [], + [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", "serial", + [], + [], ], [ "table_test", "table/table_test.cc", "parallel", + [], + [], ], [ "thread_list_test", "util/thread_list_test.cc", "serial", + [], + [], ], [ "thread_local_test", "util/thread_local_test.cc", "serial", + [], + [], ], [ "timer_queue_test", "util/timer_queue_test.cc", "serial", + [], + [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", "serial", + [], + [], ], [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", + [], + [], ], [ "ttl_test", "utilities/ttl/ttl_test.cc", "serial", + [], + [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", "serial", + [], + [], ], [ "version_builder_test", "db/version_builder_test.cc", "serial", + [], + [], ], [ "version_edit_test", "db/version_edit_test.cc", "serial", + [], + [], ], [ "version_set_test", "db/version_set_test.cc", "serial", + [], + [], ], [ "wal_manager_test", "db/wal_manager_test.cc", "serial", + [], + [], ], [ "write_batch_test", "db/write_batch_test.cc", "serial", + [], + [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", "serial", + [], + [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", "serial", + [], + [], ], [ "write_callback_test", "db/write_callback_test.cc", "serial", + [], + [], ], [ "write_controller_test", "db/write_controller_test.cc", "serial", + [], + [], ], [ "write_prepared_transaction_test", "utilities/transactions/write_prepared_transaction_test.cc", "parallel", + [], + [], ], [ "write_unprepared_transaction_test", "utilities/transactions/write_unprepared_transaction_test.cc", "parallel", + [], + [], ], ] @@ -1145,6 +1441,8 @@ ROCKS_TESTS = [ # will not be included. 
[ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, @@ -1155,6 +1453,6 @@ ROCKS_TESTS = [ test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index 94b63a4e8bf..fc59cf5830a 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -4,12 +4,31 @@ from __future__ import print_function from __future__ import unicode_literals from targets_builder import TARGETSBuilder +import json import os import fnmatch import sys from util import ColorString +# This script generates TARGETS file for Buck. +# Buck is a build tool specifying dependencies among different build targets. +# User can pass extra dependencies as a JSON object via command line, and this +# script can include these dependencies in the generate TARGETS file. +# Usage: +# $python buckifier/buckify_rocksdb.py +# (This generates a TARGET file without user-specified dependency for unit +# tests.) +# $python buckifier/buckify_rocksdb.py \ +# '{"fake": { \ +# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ +# } \ +# }' +# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB +# unit tests, and will use the extra_compiler_flags to compile the unit test +# source.) + # tests to export as libraries for inclusion in other projects _EXPORTED_TEST_LIBS = ["env_basic_test"] @@ -86,8 +105,38 @@ def get_tests(repo_path): return tests +# Parse extra dependencies passed by user from command line +def get_dependencies(): + deps_map = { + ''.encode('ascii'): { + 'extra_deps'.encode('ascii'): [], + 'extra_compiler_flags'.encode('ascii'): [] + } + } + if len(sys.argv) < 2: + return deps_map + + def encode_dict(data): + rv = {} + for k, v in data.items(): + if isinstance(k, unicode): + k = k.encode('ascii') + if isinstance(v, unicode): + v = v.encode('ascii') + elif isinstance(v, list): + v = [x.encode('ascii') for x in v] + elif isinstance(v, dict): + v = encode_dict(v) + rv[k] = v + return rv + extra_deps = json.loads(sys.argv[1], object_hook=encode_dict) + for target_alias, deps in extra_deps.items(): + deps_map[target_alias] = deps + return deps_map + + # Prepare TARGETS file for buck -def generate_targets(repo_path): +def generate_targets(repo_path, deps_map): print(ColorString.info("Generating TARGETS")) # parsed src.mk file src_mk = parse_src_mk(repo_path) @@ -121,24 +170,33 @@ def generate_targets(repo_path): ["test_util/testutil.cc"], [":rocksdb_lib"]) + print("Extra dependencies:\n{0}".format(str(deps_map))) # test for every test we found in the Makefile - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) - continue - - assert(len(match_src) == 1) - is_parallel = tests[test] - TARGETS.register_test(test, match_src[0], is_parallel) - - if test in _EXPORTED_TEST_LIBS: - test_library = "%s_lib" % test - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + for 
target_alias, deps in deps_map.items(): + for test in sorted(tests): + match_src = [src for src in cc_files if ("/%s.c" % test) in src] + if len(match_src) == 0: + print(ColorString.warning("Cannot find .cc file for %s" % test)) + continue + elif len(match_src) > 1: + print(ColorString.warning("Found more than one .cc for %s" % test)) + print(match_src) + continue + + assert(len(match_src) == 1) + is_parallel = tests[test] + test_target_name = \ + test if not target_alias else test + "_" + target_alias + TARGETS.register_test( + test_target_name, + match_src[0], + is_parallel, + deps['extra_deps'], + deps['extra_compiler_flags']) + + if test in _EXPORTED_TEST_LIBS: + test_library = "%s_lib" % test_target_name + TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -163,8 +221,9 @@ def exit_with_error(msg): def main(): + deps_map = get_dependencies() # Generate TARGETS file for buck - ok = generate_targets(get_rocksdb_path()) + ok = generate_targets(get_rocksdb_path(), deps_map) if not ok: exit_with_error("Failed to generate TARGETS files") diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index 493cd8a8a8a..78db6a169b3 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -51,14 +51,21 @@ def add_binary(self, name, srcs, deps=None): pretty_list(deps))) self.total_bin = self.total_bin + 1 - def register_test(self, test_name, src, is_parallel): + def register_test(self, + test_name, + src, + is_parallel, + extra_deps, + extra_compiler_flags): exec_mode = "serial" if is_parallel: exec_mode = "parallel" self.tests_cfg += targets_cfg.test_cfg_template % ( test_name, str(src), - str(exec_mode)) + str(exec_mode), + extra_deps, + extra_compiler_flags) self.total_test = self.total_test + 1 diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 0ebd6d9427e..19ea777270d 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -140,11 +140,13 @@ "%s", "%s", "%s", + %s, + %s, ], """ unittests_template = """ -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ %s] @@ -153,6 +155,8 @@ # will not be included. 
[
    test_binary(
+       extra_compiler_flags = extra_compiler_flags,
+       extra_deps = extra_deps,
        parallelism = parallelism,
        rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS,
        rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS,
@@ -163,7 +167,7 @@
        test_cc = test_cc,
        test_name = test_name,
    )
-   for test_name, test_cc, parallelism in ROCKS_TESTS
+   for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS
    if not is_opt_mode
]
"""
diff --git a/defs.bzl b/defs.bzl
index a9f25ebcc42..d5b7b6af718 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -12,7 +12,9 @@ def test_binary(
        rocksdb_compiler_flags,
        rocksdb_preprocessor_flags,
        rocksdb_external_deps,
-       rocksdb_os_deps):
+       rocksdb_os_deps,
+       extra_deps,
+       extra_compiler_flags):
    TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh"
    ttype = "gtest" if parallelism == "parallel" else "simple"
@@ -23,9 +25,9 @@ def test_binary(
        srcs = [test_cc],
        arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
        os_preprocessor_flags = rocksdb_os_preprocessor_flags,
-       compiler_flags = rocksdb_compiler_flags,
+       compiler_flags = rocksdb_compiler_flags + extra_compiler_flags,
        preprocessor_flags = rocksdb_preprocessor_flags,
-       deps = [":rocksdb_test_lib"],
+       deps = [":rocksdb_test_lib"] + extra_deps,
        os_deps = rocksdb_os_deps,
        external_deps = rocksdb_external_deps,
    )

From e579e32eaa33ba368c7b1d4de61da6ae4c7b1351 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 5 Aug 2019 13:30:31 -0700
Subject: [PATCH 277/572] Disable ReadYourOwnWriteStress when run under Valgrind (#5671)

Summary: The test sometimes times out when run under Valgrind, taking around 20 minutes. The patch skips the test under Valgrind.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5671

Differential Revision: D16652382

Pulled By: maysamyabandeh

fbshipit-source-id: 0f6f4f76d37337d56226b689e01b14523dd07aae
---
 utilities/transactions/write_unprepared_transaction_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index feaedea067f..e9d305c69e9 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -115,6 +115,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
   }
 }

+#ifndef ROCKSDB_VALGRIND_RUN
 TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) {
   // This is a stress test where different threads are writing random keys, and
   // then before committing or aborting the transaction, it validates to see
@@ -294,6 +295,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) {
     }
   }
 }
+#endif  // ROCKSDB_VALGRIND_RUN

 // This tests how write unprepared behaves during recovery when the DB crashes
 // after a transaction has either been unprepared or prepared, and tests if

From 208556ee13306050f20cfddb4eac6cdcc2b1c850 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 5 Aug 2019 13:30:56 -0700
Subject: [PATCH 278/572] WritePrepared: fix Get without snapshot (#5664)

Summary: If read_options.snapshot is not set, ::Get takes the last sequence number after acquiring a super-version and uses that as the snapshot sequence number. Theoretically max_evicted_seq_ could advance past this sequence number. This could cause ::IsInSnapshot, which is invoked by the ReadCallback, to notice the absence of the snapshot.
In this case, the ReadCallback should have passed a non-value to snap_released so that it could be set by the ::IsInSnapshot. The patch does that, and adds a unit test to verify it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5664 Differential Revision: D16614033 Pulled By: maysamyabandeh fbshipit-source-id: 06fb3fd4aacd75806ed1a1acec7961f5d02486f2 --- include/rocksdb/statistics.h | 2 + java/rocksjni/portal.h | 4 ++ .../src/main/java/org/rocksdb/TickerType.java | 5 ++ monitoring/statistics.cc | 1 + .../write_prepared_transaction_test.cc | 60 +++++++++++++++- utilities/transactions/write_prepared_txn.cc | 36 ++++++---- .../transactions/write_prepared_txn_db.cc | 14 ++-- .../transactions/write_prepared_txn_db.h | 68 ++++++++++++++----- .../transactions/write_unprepared_txn.cc | 32 ++++++--- utilities/transactions/write_unprepared_txn.h | 23 ++++++- .../transactions/write_unprepared_txn_db.cc | 3 +- 11 files changed, 199 insertions(+), 49 deletions(-) diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index a8d01e03415..b6b78ef99a3 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -324,6 +324,8 @@ enum Tickers : uint32_t { TXN_DUPLICATE_KEY_OVERHEAD, // # of times snapshot_mutex_ is acquired in the fast path. TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, // Number of keys actually found in MultiGet calls (vs number requested by // caller) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 667af809bdc..e9dc3fb82b1 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -4620,6 +4620,8 @@ class TickerTypeJni { return -0x0B; case rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: return -0x0C; + case rocksdb::Tickers::TXN_GET_TRY_AGAIN: + return -0x0D; case rocksdb::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -4912,6 +4914,8 @@ class TickerTypeJni { return rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; case -0x0C: return rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case -0x0D: + return rocksdb::Tickers::TXN_GET_TRY_AGAIN; case 0x5F: // 0x5F for backwards compatibility on current minor version. 
return rocksdb::Tickers::TICKER_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 551e366dc53..40a642bd666 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -717,6 +717,11 @@ public enum TickerType { */ TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + /** + * # of times ::Get returned TryAgain due to expired snapshot seq + */ + TXN_GET_TRY_AGAIN((byte) -0x0D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 15d702d1f4a..70c993b201a 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -162,6 +162,7 @@ const std::vector> TickersNameMap = { "rocksdb.txn.overhead.mutex.old.commit.map"}, {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index ef89aaeb8c7..2cb91f0d350 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1372,7 +1372,7 @@ TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) { for (int i = 0; i < writes; i++) { WriteBatch batch; // For duplicate keys cause 4 commit entries, each evicting an entry that - // is not published yet, thus causing max ecited seq go higher than last + // is not published yet, thus causing max evicted seq go higher than last // published. 
for (int b = 0; b < batch_cnt; b++) { batch.Put("foo", "foo"); @@ -1404,6 +1404,64 @@ TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) { db->ReleaseSnapshot(snap); } +// Test that reads without snapshots would not hit an undefined state +TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ReOpen(); + WriteOptions woptions; + WritePreparedTxnDB* wp_db = dynamic_cast(db); + + const int writes = 50; + rocksdb::port::Thread t1([&]() { + for (int i = 0; i < writes; i++) { + WriteBatch batch; + batch.Put("key", "foo"); + db->Write(woptions, &batch); + } + }); + + rocksdb::port::Thread t2([&]() { + while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread + std::this_thread::yield(); + } + ReadOptions ropt; + PinnableSlice pinnable_val; + TransactionOptions txn_options; + for (int i = 0; i < 10; i++) { + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + std::vector values; + auto s_vec = + txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + Slice key("key"); + txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val, + &s, true); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + delete txn; + } + }); + + t1.join(); + t2.join(); + + // Make sure that the test has worked and seq number has advanced as we + // thought + auto snap = db->GetSnapshot(); + ASSERT_GT(snap->GetSequenceNumber(), writes - 1); + db->ReleaseSnapshot(snap); +} + // Check that old_commit_map_ cleanup works correctly if the snapshot equals // max_evicted_seq_. 
TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) { diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 97bebac5d57..188f61120be 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -46,13 +46,16 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, PinnableSlice* values, Status* statuses, bool sorted_input) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); - WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, keys, values, statuses, sorted_input, &callback); - if (UNLIKELY(!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (UNLIKELY(!callback.valid() || + !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); for (size_t i = 0; i < num_keys; i++) { statuses[i] = Status::TryAgain(); } @@ -63,15 +66,18 @@ Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); - WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, pinnable_val, &callback); - if (LIKELY(wpt_db_->ValidateSnapshot(callback.max_visible_seq(), + if (LIKELY(callback.valid() && + wpt_db_->ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) { return res; } else { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); return Status::TryAgain(); } } @@ -241,9 +247,11 @@ Status WritePreparedTxn::RollbackInternal() { auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); auto read_at_seq = kMaxSequenceNumber; + ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); struct RollbackWriteBatchBuilder : public WriteBatch::Handler { DBImpl* db_; - ReadOptions roptions; WritePreparedTxnReadCallback callback; WriteBatch* rollback_batch_; std::map& comparators_; @@ -251,18 +259,20 @@ Status WritePreparedTxn::RollbackInternal() { using CFKeys = std::set; std::map keys_; bool rollback_merge_operands_; + ReadOptions roptions_; RollbackWriteBatchBuilder( DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq, WriteBatch* dst_batch, std::map& comparators, std::map& handles, - bool rollback_merge_operands) + bool rollback_merge_operands, ReadOptions _roptions) : db_(db), callback(wpt_db, snap_seq), // disable min_uncommitted optimization rollback_batch_(dst_batch), comparators_(comparators), handles_(handles), - rollback_merge_operands_(rollback_merge_operands) {} + rollback_merge_operands_(rollback_merge_operands), + roptions_(_roptions) {} Status Rollback(uint32_t cf, const Slice& key) { Status s; @@ -280,7 +290,7 @@ Status WritePreparedTxn::RollbackInternal() { PinnableSlice pinnable_val; 
      bool not_used;
      auto cf_handle = handles_[cf];
-      s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used,
+      s = db_->GetImpl(roptions_, cf_handle, key, &pinnable_val, &not_used,
                        &callback);
      assert(s.ok() || s.IsNotFound());
      if (s.ok()) {
@@ -330,7 +340,8 @@ Status WritePreparedTxn::RollbackInternal() {
    bool WriteAfterCommit() const override { return false; }
  } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
                     *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
-                    wpt_db_->txn_db_options_.rollback_merge_operands);
+                    wpt_db_->txn_db_options_.rollback_merge_operands,
+                    roptions);
  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
  assert(s.ok());
  if (!s.ok()) {
@@ -434,7 +445,8 @@ Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
  ColumnFamilyHandle* cfh =
      column_family ? column_family : db_impl_->DefaultColumnFamily();

-  WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted);
+  WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted,
+                                            kBackedByDBSnapshot);
  return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(),
                                               snap_seq, false /* cache_only */,
                                               &snap_checker, min_uncommitted);
diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc
index a3b523a22cf..e6d71020685 100644
--- a/utilities/transactions/write_prepared_txn_db.cc
+++ b/utilities/transactions/write_prepared_txn_db.cc
@@ -226,16 +226,18 @@ Status WritePreparedTxnDB::Get(const ReadOptions& options,
                                ColumnFamilyHandle* column_family,
                                const Slice& key, PinnableSlice* value) {
  SequenceNumber min_uncommitted, snap_seq;
-  const bool backed_by_snapshot =
+  const SnapshotBackup backed_by_snapshot =
      AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
-  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted);
+  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
  bool* dont_care = nullptr;
  auto res = db_impl_->GetImpl(options, column_family, key, value, dont_care,
                               &callback);
-  if (LIKELY(
-          ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) {
+  if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(),
+                                                  backed_by_snapshot))) {
    return res;
  } else {
+    WPRecordTick(TXN_GET_TRY_AGAIN);
    return Status::TryAgain();
  }
}
@@ -298,7 +300,8 @@ struct WritePreparedTxnDB::IteratorState {
  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
                std::shared_ptr<ManagedSnapshot> s,
                SequenceNumber min_uncommitted)
-      : callback(txn_db, sequence, min_uncommitted), snapshot(s) {}
+      : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot),
+        snapshot(s) {}

  WritePreparedTxnReadCallback callback;
  std::shared_ptr<ManagedSnapshot> snapshot;
@@ -392,6 +395,7 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) {
      new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
  commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
      new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
+  dummy_max_snapshot_.number_ = kMaxSequenceNumber;
}

void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h
index 9561bfada17..4ee7d8e6cf8 100644
--- a/utilities/transactions/write_prepared_txn_db.h
+++ b/utilities/transactions/write_prepared_txn_db.h
@@ -30,6 +30,7 @@
#include "utilities/transactions/write_prepared_txn.h"

namespace rocksdb {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };

// A
PessimisticTransactionDB that writes data to DB after prepare phase of 2PC. // In this way some data in the DB might not be committed. The DB provides @@ -448,18 +449,21 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const ColumnFamilyOptions& cf_options) override; // Assign the min and max sequence numbers for reading from the db. A seq > // max is not valid, and a seq < min is valid, and a min <= seq < max requires - // further checkings. Normally max is defined by the snapshot and min is by + // further checking. Normally max is defined by the snapshot and min is by // minimum uncommitted seq. - inline bool AssignMinMaxSeqs(const Snapshot* snapshot, SequenceNumber* min, - SequenceNumber* max); + inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max); // Validate is a snapshot sequence number is still valid based on the latest // db status. backed_by_snapshot specifies if the number is baked by an actual // snapshot object. order specified the memory order with which we load the // atomic variables: relax is enough for the default since we care about last // value seen by same thread. inline bool ValidateSnapshot( - const SequenceNumber snap_seq, const bool backed_by_snapshot, + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, std::memory_order order = std::memory_order_relaxed); + // Get a dummy snapshot that refers to kMaxSequenceNumber + Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; } private: friend class AddPreparedCallback; @@ -488,6 +492,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { friend class WritePreparedTransactionTest_IsInSnapshotTest_Test; friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test; friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test; + friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test; friend class WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test; friend class @@ -783,26 +788,55 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // Thread safety: since the handle is read-only object it is a const it is // safe to read it concurrently std::shared_ptr> handle_map_; + // A dummy snapshot object that refers to kMaxSequenceNumber + SnapshotImpl dummy_max_snapshot_; }; class WritePreparedTxnReadCallback : public ReadCallback { public: WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot) - : ReadCallback(snapshot), db_(db) {} + : ReadCallback(snapshot), + db_(db), + backed_by_snapshot_(kBackedByDBSnapshot) {} WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot, - SequenceNumber min_uncommitted) - : ReadCallback(snapshot, min_uncommitted), db_(db) {} + SequenceNumber min_uncommitted, + SnapshotBackup backed_by_snapshot) + : ReadCallback(snapshot, min_uncommitted), + db_(db), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WritePreparedTxnReadCallback() { + // If it is not backed by snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } // Will be called to see if the seq number visible; if not it moves on to // the next seq number. 
inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override { auto snapshot = max_visible_seq_; - return db_->IsInSnapshot(seq, snapshot, min_uncommitted_); + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; + } + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; } // TODO(myabandeh): override Refresh when Iterator::Refresh is supported private: WritePreparedTxnDB* db_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; }; class AddPreparedCallback : public PreReleaseCallback { @@ -1034,26 +1068,26 @@ struct SubBatchCounter : public WriteBatch::Handler { bool WriteAfterCommit() const override { return false; } }; -bool WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot, - SequenceNumber* min, - SequenceNumber* max) { +SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max) { if (snapshot != nullptr) { *min = static_cast_with_check(snapshot) ->min_uncommitted_; *max = static_cast_with_check(snapshot) ->number_; - return true; + return kBackedByDBSnapshot; } else { *min = SmallestUnCommittedSeq(); *max = 0; // to be assigned later after sv is referenced. - return false; + return kUnbackedByDBSnapshot; } } -bool WritePreparedTxnDB::ValidateSnapshot(const SequenceNumber snap_seq, - const bool backed_by_snapshot, - std::memory_order order) { - if (backed_by_snapshot) { +bool WritePreparedTxnDB::ValidateSnapshot( + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, + std::memory_order order) { + if (backed_by_snapshot == kBackedByDBSnapshot) { return true; } else { SequenceNumber max = max_evicted_seq_.load(order); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 993c3b8b60c..a1862d32d44 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -25,7 +25,11 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { } } - return db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); + bool snap_released = false; + auto ret = db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; } WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, @@ -547,8 +551,9 @@ Status WriteUnpreparedTxn::RollbackInternal() { Status s; const auto& cf_map = *wupt_db_->GetCFHandleMap(); auto read_at_seq = kMaxSequenceNumber; - ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); // Note that we do not use WriteUnpreparedTxnReadCallback because we do not // need to read our own writes when reading prior versions of the key for // rollback. 
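// (Sketch of the reasoning behind the GetMaxSnapshot() call above, pieced
// together from this patch: DBImpl::Get refreshes the read callback's
// sequence to the current last sequence unless read_options.snapshot is set,
// so pinning the read to dummy_max_snapshot_, whose number_ is
// kMaxSequenceNumber, keeps every version visible to the callback and leaves
// visibility decisions entirely to the callback during rollback.)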
@@ -704,7 +709,8 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { ->min_uncommitted_; SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber(); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - top.unprep_seqs_); + top.unprep_seqs_, + kBackedByDBSnapshot); const auto& cf_map = *wupt_db_->GetCFHandleMap(); for (const auto& cfkey : tracked_keys) { const auto cfid = cfkey.first; @@ -784,14 +790,16 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, PinnableSlice* values, Status* statuses, bool sorted_input) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - unprep_seqs_); + unprep_seqs_, backed_by_snapshot); write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, keys, values, statuses, sorted_input, &callback); - if (UNLIKELY(!wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (UNLIKELY(!callback.valid() || + !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); for (size_t i = 0; i < num_keys; i++) { statuses[i] = Status::TryAgain(); } @@ -802,15 +810,17 @@ Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - unprep_seqs_); + unprep_seqs_, backed_by_snapshot); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, value, &callback); - if (LIKELY(wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (LIKELY(callback.valid() && + wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { return res; } else { + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); return Status::TryAgain(); } } @@ -854,8 +864,8 @@ Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, ColumnFamilyHandle* cfh = column_family ? column_family : db_impl_->DefaultColumnFamily(); - WriteUnpreparedTxnReadCallback snap_checker(wupt_db_, snap_seq, - min_uncommitted, unprep_seqs_); + WriteUnpreparedTxnReadCallback snap_checker( + wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot); return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(), snap_seq, false /* cache_only */, &snap_checker, min_uncommitted); diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 774d90e8d37..5c654b05ba8 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -56,7 +56,8 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WriteUnpreparedTxnReadCallback( WritePreparedTxnDB* db, SequenceNumber snapshot, SequenceNumber min_uncommitted, - const std::map<SequenceNumber, size_t>& unprep_seqs) + const std::map<SequenceNumber, size_t>& unprep_seqs, + SnapshotBackup backed_by_snapshot) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out our own writes.
We // will do the exact comparison against snapshots in IsVisibleFullCheck @@ -64,10 +65,23 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), db_(db), unprep_seqs_(unprep_seqs), - wup_snapshot_(snapshot) {} + wup_snapshot_(snapshot), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WriteUnpreparedTxnReadCallback() { + // If it is not backed by a snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } virtual bool IsVisibleFullCheck(SequenceNumber seq) override; + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + void Refresh(SequenceNumber seq) override { max_visible_seq_ = std::max(max_visible_seq_, seq); wup_snapshot_ = seq; @@ -88,6 +102,11 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WritePreparedTxnDB* db_; const std::map<SequenceNumber, size_t>& unprep_seqs_; SequenceNumber wup_snapshot_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; }; class WriteUnpreparedTxn : public WritePreparedTxn { diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 4381619e782..defaf9fce6e 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -348,7 +348,8 @@ struct WriteUnpreparedTxnDB::IteratorState { IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, std::shared_ptr<ManagedSnapshot> s, SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn) - : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_), + : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_, + kBackedByDBSnapshot), snapshot(s) {} SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); } From b1a02ffeabb3ad3edceddf31f88c7543f01a03d4 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 5 Aug 2019 15:40:31 -0700 Subject: [PATCH 279/572] Fix make target 'all' and 'check' (#5672) Summary: If a test is one of the parallel tests, then it should also be listed in 'TESTS'. Otherwise, `make all` won't build the binaries. For example, ``` $COMPILE_WITH_ASAN=1 make -j32 all ``` Then if you do ``` $make check ``` The second command will invoke the compilation and building for db_bloom_filter_test and file_reader_writer_test **without** `COMPILE_WITH_ASAN=1`, causing the command to fail. Test plan (on devserver): ``` $make -j32 all ``` Verify all binaries are built so that `make check` won't have to compile anything.
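For illustration, a minimal hand-written Makefile fragment showing the failure mode (this is only a sketch, not the real Makefile; `run_parallel` is a made-up helper):
```
TESTS = db_test                        # binaries built by 'make all'
PARALLEL_TEST = db_bloom_filter_test   # run by 'make check', missing above

all: $(TESTS)

check: all
	# db_bloom_filter_test was not built by 'all', so make compiles it here,
	# without the flags (e.g. COMPILE_WITH_ASAN=1) that 'all' was given.
	./run_parallel $(PARALLEL_TEST)
```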
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5672 Differential Revision: D16655834 Pulled By: riversand963 fbshipit-source-id: 050131412b5313496f85ae3deeeeb8d28af75746 --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index fbe6d2d06ff..4502be8e46b 100644 --- a/Makefile +++ b/Makefile @@ -445,6 +445,7 @@ TESTS = \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ @@ -479,6 +480,7 @@ TESTS = \ fault_injection_test \ filelock_test \ filename_test \ + file_reader_writer_test \ block_based_filter_block_test \ full_filter_block_test \ partitioned_filter_block_test \ From f4a616ebf9e4417fe74e459ae58e4d31642bafcb Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 5 Aug 2019 18:31:42 -0700 Subject: [PATCH 280/572] Block cache analyzer: python script to plot graphs (#5673) Summary: This PR updated the python script to plot graphs for stats output from block cache analyzer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5673 Test Plan: Manually run the script to generate graphs. Differential Revision: D16657145 Pulled By: HaoyuHuang fbshipit-source-id: fd510b5fd4307835f9a986fac545734dbe003d28 --- .../block_cache_trace_analyzer_plot.py | 402 ++++++++++++++++-- 1 file changed, 360 insertions(+), 42 deletions(-) diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py index 22d56b932c5..0fdaa41586e 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py @@ -1,12 +1,17 @@ #!/usr/bin/env python3 import csv +import math import os import random import sys +import matplotlib +matplotlib.use("Agg") import matplotlib.backends.backend_pdf import matplotlib.pyplot as plt import numpy as np +import pandas as pd +import seaborn as sns # Make sure a legend has the same color across all generated graphs. 
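+# A single module-level map from label to color (bar_color_maps, populated
+# below) is shared by the chart helpers, so a given cache configuration is
+# drawn with the same color in every generated graph.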
@@ -19,7 +24,7 @@ def get_cmap(n, name="hsv"): color_index = 0 bar_color_maps = {} colors = [] -n_colors = 60 +n_colors = 360 linear_colors = get_cmap(n_colors) for i in range(n_colors): colors.append(linear_colors(i)) @@ -35,41 +40,95 @@ def num_to_gb(n): return "{0:.2f}".format(float(n) / one_gb) -def plot_miss_ratio_graphs(csv_result_dir, output_result_dir): - mrc_file_path = csv_result_dir + "/mrc" - if not os.path.exists(mrc_file_path): - return +def plot_miss_stats_graphs( + csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name +): miss_ratios = {} - print("Processing file {}".format(mrc_file_path)) - with open(mrc_file_path, "r") as csvfile: - rows = csv.reader(csvfile, delimiter=",") - is_header = False - for row in rows: - if not is_header: - is_header = True - continue - cache_name = row[0] - num_shard_bits = int(row[1]) - ghost_capacity = int(row[2]) - capacity = int(row[3]) - miss_ratio = float(row[4]) - config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) - if config not in miss_ratios: - miss_ratios[config] = {} - miss_ratios[config]["x"] = [] - miss_ratios[config]["y"] = [] - miss_ratios[config]["x"].append(num_to_gb(capacity)) - miss_ratios[config]["y"].append(miss_ratio) + for file in os.listdir(csv_result_dir): + if not file.startswith(file_prefix): + continue + if not file.endswith(file_suffix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) + mrc_file_path = csv_result_dir + "/" + file + with open(mrc_file_path, "r") as csvfile: + rows = csv.reader(csvfile, delimiter=",") + for row in rows: + cache_name = row[0] + num_shard_bits = int(row[1]) + ghost_capacity = int(row[2]) + capacity = int(row[3]) + miss_ratio = float(row[4]) + config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + if config not in miss_ratios: + miss_ratios[config] = {} + miss_ratios[config]["x"] = [] + miss_ratios[config]["y"] = [] + miss_ratios[config]["x"].append(capacity) + miss_ratios[config]["y"].append(miss_ratio) + fig = plt.figure() + for config in miss_ratios: + plt.plot( + miss_ratios[config]["x"], miss_ratios[config]["y"], label=config + ) + plt.xlabel("Cache capacity") + plt.ylabel(ylabel) + plt.xscale("log", basex=2) + plt.ylim(ymin=0) + plt.title("{}".format(file)) + plt.legend() + fig.savefig( + output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + ) + + +def plot_miss_stats_diff_lru_graphs( + csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name +): + miss_ratios = {} + for file in os.listdir(csv_result_dir): + if not file.startswith(file_prefix): + continue + if not file.endswith(file_suffix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) + mrc_file_path = csv_result_dir + "/" + file + with open(mrc_file_path, "r") as csvfile: + rows = csv.reader(csvfile, delimiter=",") + for row in rows: + cache_name = row[0] + num_shard_bits = int(row[1]) + ghost_capacity = int(row[2]) + capacity = int(row[3]) + miss_ratio = float(row[4]) + config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + if config not in miss_ratios: + miss_ratios[config] = {} + miss_ratios[config]["x"] = [] + miss_ratios[config]["y"] = [] + miss_ratios[config]["x"].append(capacity) + miss_ratios[config]["y"].append(miss_ratio) + if "lru-0-0" not in miss_ratios: + return fig = plt.figure() for config in miss_ratios: - plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config) - plt.xlabel("Cache capacity (GB)") - 
plt.ylabel("Miss Ratio (%)") - # plt.xscale('log', basex=2) - plt.ylim(ymin=0) - plt.title("RocksDB block cache miss ratios") + diffs = [0] * len(miss_ratios["lru-0-0"]["x"]) + for i in range(len(miss_ratios["lru-0-0"]["x"])): + for j in range(len(miss_ratios[config]["x"])): + if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]: + diffs[i] = ( + miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i] + ) + break + plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config) + plt.xlabel("Cache capacity") + plt.ylabel(ylabel) + plt.xscale("log", basex=2) + plt.title("{}".format(file)) plt.legend() - fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight") + fig.savefig( + output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + ) def sanitize(label): @@ -143,6 +202,7 @@ def read_data_for_plot(csvfile, vertical): def plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix, filename_suffix, pdf_name, xlabel, @@ -151,11 +211,14 @@ def plot_line_charts( vertical, legend, ): + global color_index, bar_color_maps, colors pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name) for file in os.listdir(csv_result_dir): if not file.endswith(filename_suffix): continue - print("Processing file {}".format(file)) + if not file.startswith(filename_prefix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) with open(csv_result_dir + "/" + file, "r") as csvfile: x, labels, label_stats = read_data_for_plot(csvfile, vertical) if len(x) == 0 or len(labels) == 0: @@ -163,10 +226,15 @@ def plot_line_charts( # plot figure fig = plt.figure() for label_index in label_stats: + # Assign a unique color to this label. + if labels[label_index] not in bar_color_maps: + bar_color_maps[labels[label_index]] = colors[color_index] + color_index += 1 plt.plot( - [int(x[i]) for i in range(len(x))], - label_stats[label_index], + [int(x[i]) for i in range(len(x) - 1)], + label_stats[label_index][:-1], label=labels[label_index], + color=bar_color_maps[labels[label_index]], ) # Translate time unit into x labels. @@ -239,10 +307,29 @@ def plot_stacked_bar_charts( pdf.close() -def plot_access_timeline(csv_result_dir, output_result_dir): +def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title): + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + corr_table = corr_table.pivot("label", "corr", "value") + fig = plt.figure() + sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2") + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_timeline(csv_result_dir, output_result_dir): plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix="", filename_suffix="access_timeline", pdf_name="access_time.pdf", xlabel="Time", @@ -253,6 +340,109 @@ def plot_access_timeline(csv_result_dir, output_result_dir): ) +def convert_to_0_if_nan(n): + if math.isnan(n): + return 0.0 + return n + + +def plot_correlation(csv_result_dir, output_result_dir): + # Processing the correlation input first. 
+ label_str_file = {} + for file in os.listdir(csv_result_dir): + if not file.endswith("correlation_input"): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + label_str = file.split("_")[0] + label = file[len(label_str) + 1 :] + label = label[: len(label) - len("_correlation_input")] + + output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str) + if output_file not in label_str_file: + f = open("{}/{}_correlation_output".format(csv_result_dir, label_str), "w+") + label_str_file[output_file] = f + f.write("label,corr,value\n") + f = label_str_file[output_file] + f.write( + "{},{},{}\n".format( + label, + "LA+A", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+A", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+A", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LA+T", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+T", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+T", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + for label_str in label_str_file: + label_str_file[label_str].close() + + plot_heatmap( + csv_result_dir, + output_result_dir, + "correlation_output", + "correlation.pdf", + "Correlation", + ) + + def plot_reuse_graphs(csv_result_dir, output_result_dir): plot_stacked_bar_charts( csv_result_dir, @@ -301,6 +491,7 @@ def plot_reuse_graphs(csv_result_dir, output_result_dir): plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix="", filename_suffix="reuse_blocks_timeline", pdf_name="reuse_blocks_timeline.pdf", xlabel="", @@ -370,14 +561,90 @@ def plot_access_count_summary(csv_result_dir, output_result_dir): vertical=True, x_prefix="< ", ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="skewness", + pdf_name="skew.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Skewness", + vertical=True, + legend=False, + ) + + +def plot_miss_ratio_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_ratio_timeline", + pdf_name="miss_ratio_timeline.pdf", + xlabel="Time", + ylabel="Miss Ratio (%)", + title="Miss ratio timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + 
filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_timeline", + pdf_name="policy_timeline.pdf", + xlabel="Time", + ylabel="# of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_ratio_timeline", + pdf_name="policy_ratio_timeline.pdf", + xlabel="Time", + ylabel="Percentage of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) if __name__ == "__main__": if len(sys.argv) < 3: print( - "Must provide two arguments: 1) The directory that saves a list of " - "directories which contain block cache trace analyzer result files " - "2) the directory to save plotted graphs." + "Must provide two arguments: \n" + "1) The directory that saves a list of " + "directories which contain block cache trace analyzer result files. \n" + "2) the directory to save plotted graphs. \n" ) exit(1) csv_result_dir = sys.argv[1] @@ -396,8 +663,59 @@ def plot_access_count_summary(csv_result_dir, output_result_dir): print("Processing experiment dir: {}".format(csv_relative_dir)) if not os.path.exists(result_dir): os.makedirs(result_dir) - plot_miss_ratio_graphs(csv_abs_dir, result_dir) - plot_access_timeline(csv_abs_dir, result_dir) + plot_access_count_summary(csv_abs_dir, result_dir) + plot_timeline(csv_abs_dir, result_dir) + plot_miss_ratio_timeline(csv_result_dir, output_result_dir) + plot_correlation(csv_abs_dir, result_dir) plot_reuse_graphs(csv_abs_dir, result_dir) plot_percentage_access_summary(csv_abs_dir, result_dir) - plot_access_count_summary(csv_abs_dir, result_dir) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc", + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc_diff_lru", + ) + # The following stats are only available in pysim. 
+ for time_unit in ["1", "60", "3600"]: + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit), + ) From cc9fa7fcdb35fdd12505053b2a6cd38140c93d3b Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Mon, 5 Aug 2019 19:47:33 -0700 Subject: [PATCH 281/572] cmake: cmake related cleanups (#5662) Summary: - cmake: use the builtin FindBzip2.cmake from CMake - cmake: require CMake v3.5.1 - cmake: add imported target for 3rd party libraries - cmake: extract ReadVersion.cmake out and refactor it Pull Request resolved: https://github.com/facebook/rocksdb/pull/5662 Differential Revision: D16660974 Pulled By: maysamyabandeh fbshipit-source-id: 681594910e74253251fe14ad0befc41a4d0f4fd4 --- CMakeLists.txt | 65 ++++++++++++-------------------- cmake/modules/FindJeMalloc.cmake | 24 ++++++++---- cmake/modules/FindNUMA.cmake | 16 ++++++-- cmake/modules/FindTBB.cmake | 26 ++++++++----- cmake/modules/Findbzip2.cmake | 21 ----------- cmake/modules/Findlz4.cmake | 28 +++++++++----- cmake/modules/Findsnappy.cmake | 26 ++++++++----- cmake/modules/Findzstd.cmake | 28 +++++++++----- cmake/modules/ReadVersion.cmake | 10 +++++ 9 files changed, 132 insertions(+), 112 deletions(-) delete mode 100644 cmake/modules/Findbzip2.cmake create mode 100644 cmake/modules/ReadVersion.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7266f3b55c8..bb99d1b7ec8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,18 +32,19 @@ # 3. cmake .. # 4. 
make -j -cmake_minimum_required(VERSION 2.8.12) -project(rocksdb) -enable_language(CXX) -enable_language(C) -enable_language(ASM) +cmake_minimum_required(VERSION 3.5.1) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +include(ReadVersion) +get_rocksdb_version(rocksdb_VERSION) +project(rocksdb + VERSION ${rocksdb_VERSION} + LANGUAGES CXX C ASM) if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) @@ -74,8 +75,7 @@ else() if(WITH_JEMALLOC) find_package(JeMalloc REQUIRED) add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) - include_directories(${JEMALLOC_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${JEMALLOC_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS JeMalloc::JeMalloc) endif() endif() @@ -93,43 +93,38 @@ else() if(WITH_SNAPPY) find_package(snappy REQUIRED) add_definitions(-DSNAPPY) - include_directories(${SNAPPY_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${SNAPPY_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS snappy::snappy) endif() if(WITH_ZLIB) find_package(ZLIB REQUIRED) add_definitions(-DZLIB) - if(ZLIB_INCLUDE_DIRS) - # CMake 3 - include_directories(${ZLIB_INCLUDE_DIRS}) - else() - # CMake 2 - include_directories(${ZLIB_INCLUDE_DIR}) - endif() - list(APPEND THIRDPARTY_LIBS ${ZLIB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS ZLIB::ZLIB) endif() option(WITH_BZ2 "build with bzip2" OFF) if(WITH_BZ2) - find_package(bzip2 REQUIRED) + find_package(BZip2 REQUIRED) add_definitions(-DBZIP2) - include_directories(${BZIP2_INCLUDE_DIR}) + if(BZIP2_INCLUDE_DIRS) + include_directories(${BZIP2_INCLUDE_DIRS}) + else() + include_directories(${BZIP2_INCLUDE_DIR}) + endif() list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES}) endif() if(WITH_LZ4) find_package(lz4 REQUIRED) add_definitions(-DLZ4) - include_directories(${LZ4_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${LZ4_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS lz4::lz4) endif() if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) include_directories(${ZSTD_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${ZSTD_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS zstd::zstd) endif() endif() @@ -150,17 +145,6 @@ endif() string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") -# Read rocksdb version from version.h header file. 
-file(READ include/rocksdb/version.h version_header_file) -string(REGEX MATCH "#define ROCKSDB_MAJOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MAJOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_MINOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MINOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_PATCH ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_PATCH ${CMAKE_MATCH_1}) -set(ROCKSDB_VERSION ${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}) - - option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) if(WITH_MD_LIBRARY) @@ -316,15 +300,14 @@ if(WITH_NUMA) find_package(NUMA REQUIRED) add_definitions(-DNUMA) include_directories(${NUMA_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${NUMA_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS NUMA::NUMA) endif() option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) if(WITH_TBB) find_package(TBB REQUIRED) add_definitions(-DTBB) - include_directories(${TBB_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${TBB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS TBB::TBB) endif() # Stall notifications eat some performance from inserts @@ -777,8 +760,8 @@ else() ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX - VERSION ${ROCKSDB_VERSION} - SOVERSION ${ROCKSDB_VERSION_MAJOR} + VERSION ${rocksdb_VERSION} + SOVERSION ${rocksdb_VERSION_MAJOR} CXX_STANDARD 11 OUTPUT_NAME "rocksdb") endif() @@ -833,7 +816,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) write_basic_package_version_file( RocksDBConfigVersion.cmake - VERSION ${ROCKSDB_VERSION} + VERSION ${rocksdb_VERSION} COMPATIBILITY SameMajorVersion ) diff --git a/cmake/modules/FindJeMalloc.cmake b/cmake/modules/FindJeMalloc.cmake index 7911f77c4c3..f695b3ed1b3 100644 --- a/cmake/modules/FindJeMalloc.cmake +++ b/cmake/modules/FindJeMalloc.cmake @@ -1,21 +1,29 @@ # - Find JeMalloc library # Find the native JeMalloc includes and library # -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. +# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc. +# JeMalloc_LIBRARIES - List of libraries when using jemalloc. +# JeMalloc_FOUND - True if jemalloc found. -find_path(JEMALLOC_INCLUDE_DIR +find_path(JeMalloc_INCLUDE_DIRS NAMES jemalloc/jemalloc.h HINTS ${JEMALLOC_ROOT_DIR}/include) -find_library(JEMALLOC_LIBRARIES +find_library(JeMalloc_LIBRARIES NAMES jemalloc HINTS ${JEMALLOC_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) +find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS) mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) + JeMalloc_LIBRARIES + JeMalloc_INCLUDE_DIRS) + +if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc)) + add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED) + set_target_properties(JeMalloc::JeMalloc + PROPERTIES + IMPORTED_LOCATION ${JeMalloc_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindNUMA.cmake b/cmake/modules/FindNUMA.cmake index 02760344c68..69b95c9b60b 100644 --- a/cmake/modules/FindNUMA.cmake +++ b/cmake/modules/FindNUMA.cmake @@ -1,11 +1,11 @@ # - Find NUMA # Find the NUMA library and includes # -# NUMA_INCLUDE_DIR - where to find numa.h, etc. +# NUMA_INCLUDE_DIRS - where to find numa.h, etc. 
# NUMA_LIBRARIES - List of libraries when using NUMA. # NUMA_FOUND - True if NUMA found. -find_path(NUMA_INCLUDE_DIR +find_path(NUMA_INCLUDE_DIRS NAMES numa.h numaif.h HINTS ${NUMA_ROOT_DIR}/include) @@ -14,8 +14,16 @@ find_library(NUMA_LIBRARIES HINTS ${NUMA_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS) mark_as_advanced( NUMA_LIBRARIES - NUMA_INCLUDE_DIR) + NUMA_INCLUDE_DIRS) + +if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA)) + add_library (NUMA::NUMA UNKNOWN IMPORTED) + set_target_properties(NUMA::NUMA + PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindTBB.cmake b/cmake/modules/FindTBB.cmake index 556ce872b17..f6861fa5521 100644 --- a/cmake/modules/FindTBB.cmake +++ b/cmake/modules/FindTBB.cmake @@ -1,7 +1,7 @@ # - Find TBB # Find the Thread Building Blocks library and includes # -# TBB_INCLUDE_DIR - where to find tbb.h, etc. +# TBB_INCLUDE_DIRS - where to find tbb.h, etc. # TBB_LIBRARIES - List of libraries when using TBB. # TBB_FOUND - True if TBB found. @@ -9,17 +9,25 @@ if(NOT DEFINED TBB_ROOT_DIR) set(TBB_ROOT_DIR "$ENV{TBBROOT}") endif() -find_path(TBB_INCLUDE_DIR -NAMES tbb/tbb.h -HINTS ${TBB_ROOT_DIR}/include) +find_path(TBB_INCLUDE_DIRS + NAMES tbb/tbb.h + HINTS ${TBB_ROOT_DIR}/include) find_library(TBB_LIBRARIES -NAMES tbb -HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) + NAMES tbb + HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIR) +find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS) mark_as_advanced( -TBB_LIBRARIES -TBB_INCLUDE_DIR) + TBB_LIBRARIES + TBB_INCLUDE_DIRS) + +if(TBB_FOUND AND NOT (TARGET TBB::TBB)) + add_library (TBB::TBB UNKNOWN IMPORTED) + set_target_properties(TBB::TBB + PROPERTIES + IMPORTED_LOCATION ${TBB_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findbzip2.cmake b/cmake/modules/Findbzip2.cmake deleted file mode 100644 index 87abbe941e0..00000000000 --- a/cmake/modules/Findbzip2.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# - Find Bzip2 -# Find the bzip2 compression library and includes -# -# BZIP2_INCLUDE_DIR - where to find bzlib.h, etc. -# BZIP2_LIBRARIES - List of libraries when using bzip2. -# BZIP2_FOUND - True if bzip2 found. - -find_path(BZIP2_INCLUDE_DIR - NAMES bzlib.h - HINTS ${BZIP2_ROOT_DIR}/include) - -find_library(BZIP2_LIBRARIES - NAMES bz2 - HINTS ${BZIP2_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(bzip2 DEFAULT_MSG BZIP2_LIBRARIES BZIP2_INCLUDE_DIR) - -mark_as_advanced( - BZIP2_LIBRARIES - BZIP2_INCLUDE_DIR) diff --git a/cmake/modules/Findlz4.cmake b/cmake/modules/Findlz4.cmake index c34acef5e39..7cf7d7f5fe3 100644 --- a/cmake/modules/Findlz4.cmake +++ b/cmake/modules/Findlz4.cmake @@ -1,21 +1,29 @@ # - Find Lz4 # Find the lz4 compression library and includes # -# LZ4_INCLUDE_DIR - where to find lz4.h, etc. -# LZ4_LIBRARIES - List of libraries when using lz4. -# LZ4_FOUND - True if lz4 found. +# lz4_INCLUDE_DIRS - where to find lz4.h, etc. +# lz4_LIBRARIES - List of libraries when using lz4. +# lz4_FOUND - True if lz4 found. 
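+# An imported target lz4::lz4 is also defined, so that linking against it
+# propagates the include directories automatically.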
-find_path(LZ4_INCLUDE_DIR +find_path(lz4_INCLUDE_DIRS NAMES lz4.h - HINTS ${LZ4_ROOT_DIR}/include) + HINTS ${lz4_ROOT_DIR}/include) -find_library(LZ4_LIBRARIES +find_library(lz4_LIBRARIES NAMES lz4 - HINTS ${LZ4_ROOT_DIR}/lib) + HINTS ${lz4_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(lz4 DEFAULT_MSG LZ4_LIBRARIES LZ4_INCLUDE_DIR) +find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS) mark_as_advanced( - LZ4_LIBRARIES - LZ4_INCLUDE_DIR) + lz4_LIBRARIES + lz4_INCLUDE_DIRS) + +if(lz4_FOUND AND NOT (TARGET lz4::lz4)) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 + PROPERTIES + IMPORTED_LOCATION ${lz4_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findsnappy.cmake b/cmake/modules/Findsnappy.cmake index 6ed5fda3d57..2de2889c1a6 100644 --- a/cmake/modules/Findsnappy.cmake +++ b/cmake/modules/Findsnappy.cmake @@ -1,21 +1,29 @@ # - Find Snappy # Find the snappy compression library and includes # -# SNAPPY_INCLUDE_DIR - where to find snappy.h, etc. -# SNAPPY_LIBRARIES - List of libraries when using snappy. -# SNAPPY_FOUND - True if snappy found. +# snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# snappy_LIBRARIES - List of libraries when using snappy. +# snappy_FOUND - True if snappy found. -find_path(SNAPPY_INCLUDE_DIR +find_path(snappy_INCLUDE_DIRS NAMES snappy.h - HINTS ${SNAPPY_ROOT_DIR}/include) + HINTS ${snappy_ROOT_DIR}/include) find_library(SNAPPY_LIBRARIES NAMES snappy - HINTS ${SNAPPY_ROOT_DIR}/lib) + HINTS ${snappy_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG SNAPPY_LIBRARIES SNAPPY_INCLUDE_DIR) +find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) mark_as_advanced( - SNAPPY_LIBRARIES - SNAPPY_INCLUDE_DIR) + snappy_LIBRARIES + snappy_INCLUDE_DIRS) + +if(snappy_FOUND AND NOT (TARGET snappy::snappy)) + add_library (snappy::snappy UNKNOWN IMPORTED) + set_target_properties(snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findzstd.cmake b/cmake/modules/Findzstd.cmake index a2964aa9f80..9430821df6e 100644 --- a/cmake/modules/Findzstd.cmake +++ b/cmake/modules/Findzstd.cmake @@ -1,21 +1,29 @@ # - Find zstd # Find the zstd compression library and includes # -# ZSTD_INCLUDE_DIR - where to find zstd.h, etc. -# ZSTD_LIBRARIES - List of libraries when using zstd. -# ZSTD_FOUND - True if zstd found. +# zstd_INCLUDE_DIRS - where to find zstd.h, etc. +# zstd_LIBRARIES - List of libraries when using zstd. +# zstd_FOUND - True if zstd found. 
-find_path(ZSTD_INCLUDE_DIR +find_path(zstd_INCLUDE_DIRS NAMES zstd.h - HINTS ${ZSTD_ROOT_DIR}/include) + HINTS ${zstd_ROOT_DIR}/include) -find_library(ZSTD_LIBRARIES +find_library(zstd_LIBRARIES NAMES zstd - HINTS ${ZSTD_ROOT_DIR}/lib) + HINTS ${zstd_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(zstd DEFAULT_MSG ZSTD_LIBRARIES ZSTD_INCLUDE_DIR) +find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS) mark_as_advanced( - ZSTD_LIBRARIES - ZSTD_INCLUDE_DIR) + zstd_LIBRARIES + zstd_INCLUDE_DIRS) + +if(zstd_FOUND AND NOT (TARGET zstd::zstd)) + add_library (zstd::zstd UNKNOWN IMPORTED) + set_target_properties(zstd::zstd + PROPERTIES + IMPORTED_LOCATION ${zstd_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/ReadVersion.cmake b/cmake/modules/ReadVersion.cmake new file mode 100644 index 00000000000..ae356d99659 --- /dev/null +++ b/cmake/modules/ReadVersion.cmake @@ -0,0 +1,10 @@ +# Read rocksdb version from version.h header file. + +function(get_rocksdb_version version_var) + file(READ "${CMAKE_SOURCE_DIR}/include/rocksdb/version.h" version_header_file) + foreach(component MAJOR MINOR PATCH) + string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file}) + set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1}) + endforeach() + set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE) +endfunction() From 4f98b43ba3d5ee469af4429981b01f086bf6102a Mon Sep 17 00:00:00 2001 From: Yun Tang Date: Tue, 6 Aug 2019 09:10:32 -0700 Subject: [PATCH 282/572] Correct the default write buffer size of java doc (#5670) Summary: The actual value of default write buffer size within `rocksdb/include/rocksdb/options.h` is 64 MB, we should correct this value in java doc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5670 Differential Revision: D16668815 Pulled By: maysamyabandeh fbshipit-source-id: cc3a981c9f1c2cd4a8392b0ed5f1fd0a2d729afb --- .../java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java index 4f4749646f8..4ae96daaf8a 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -21,7 +21,7 @@ public interface MutableColumnFamilyOptionsInterface * Also, a larger write buffer will result in a longer recovery time * the next time the database is opened. * - * Default: 4MB + * Default: 64MB * @param writeBufferSize the size of write buffer. * @return the instance of the current object. * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms From d150e01474a0cb281792f51b81260b629b18457f Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Tue, 6 Aug 2019 14:22:34 -0700 Subject: [PATCH 283/572] New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). 
If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf --- CMakeLists.txt | 2 + Makefile | 4 + TARGETS | 8 + appveyor.yml | 2 +- db/compacted_db_impl.cc | 4 +- db/db_blob_index_test.cc | 8 +- db/db_impl/db_impl.cc | 112 +++++--- db/db_impl/db_impl.h | 43 +++- db/db_impl/db_impl_files.cc | 3 +- db/db_merge_operand_test.cc | 240 ++++++++++++++++++ db/db_merge_operator_test.cc | 8 +- db/db_test.cc | 9 + db/db_test2.cc | 8 +- db/memtable.cc | 32 ++- db/memtable.h | 13 +- db/memtable_list.cc | 14 + db/memtable_list.h | 7 + db/version_set.cc | 17 +- db/version_set.h | 30 ++- file/filename.cc | 3 +- include/rocksdb/db.h | 20 ++ include/rocksdb/status.h | 1 + include/rocksdb/utilities/stackable_db.h | 11 + src.mk | 2 + .../block_based/data_block_hash_index_test.cc | 8 +- table/cuckoo/cuckoo_table_reader_test.cc | 12 +- table/get_context.cc | 113 ++++++--- table/get_context.h | 15 +- table/table_reader_bench.cc | 2 +- table/table_test.cc | 16 +- tools/db_bench_tool.cc | 105 +++++++- utilities/blob_db/blob_db_impl.cc | 17 +- utilities/merge_operators.h | 3 + utilities/merge_operators/sortlist.cc | 100 ++++++++ utilities/merge_operators/sortlist.h | 38 +++ utilities/transactions/write_prepared_txn.cc | 8 +- .../transactions/write_prepared_txn_db.cc | 8 +- .../transactions/write_unprepared_txn.cc | 16 +- .../transactions/write_unprepared_txn_db.cc | 8 +- .../write_batch_with_index.cc | 7 +- 40 files changed, 914 insertions(+), 163 deletions(-) create mode 100644 db/db_merge_operand_test.cc create mode 100644 utilities/merge_operators/sortlist.cc create mode 100644 
utilities/merge_operators/sortlist.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bb99d1b7ec8..8622242aa75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -661,6 +661,7 @@ set(SOURCES utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc + utilities/merge_operators/sortlist.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc @@ -887,6 +888,7 @@ if(WITH_TESTS) db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc + db/db_merge_operand_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc diff --git a/Makefile b/Makefile index 4502be8e46b..1718309cb89 100644 --- a/Makefile +++ b/Makefile @@ -454,6 +454,7 @@ TESTS = \ db_iterator_test \ db_memtable_test \ db_merge_operator_test \ + db_merge_operand_test \ db_options_test \ db_range_del_test \ db_secondary_test \ @@ -1254,6 +1255,9 @@ db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHA db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 25d7ff66759..bac5c4311aa 100644 --- a/TARGETS +++ b/TARGETS @@ -301,6 +301,7 @@ cpp_library( "utilities/merge_operators/bytesxor.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", @@ -755,6 +756,13 @@ ROCKS_TESTS = [ [], [], ], + [ + "db_merge_operand_test", + "db/db_merge_operand_test.cc", + "parallel", + [], + [], + ], [ "db_options_test", "db/db_options_test.cc", diff --git a/appveyor.yml b/appveyor.yml index 6bdb164e84e..77901c40724 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -60,7 +60,7 @@ build: test: test_script: - - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 on_failure: - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index 88928391ad2..13cccbd7746 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -37,7 +37,7 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value) { GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, key, value, nullptr, nullptr, - nullptr, nullptr); + true, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), &get_context, nullptr); @@ -70,7 +70,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, std::string& value = (*values)[idx]; GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); 
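+      // The 'true' added above is GetContext's new do_merge flag, so this
+      // Get() returns the fully merged value rather than raw merge operands.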
r->Get(options, lkey.internal_key(), &get_context, nullptr); value.assign(pinnable_val.data(), pinnable_val.size()); diff --git a/db/db_blob_index_test.cc b/db/db_blob_index_test.cc index 005a23d63b7..e9618885a2d 100644 --- a/db/db_blob_index_test.cc +++ b/db/db_blob_index_test.cc @@ -63,9 +63,11 @@ class DBBlobIndexTest : public DBTestBase { ReadOptions read_options; read_options.snapshot = snapshot; PinnableSlice value; - auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, - nullptr /*value_found*/, nullptr /*callback*/, - is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); if (s.IsNotFound()) { return "NOT_FOUND"; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 81c44388bcf..9236d911e78 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1441,19 +1441,22 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + return GetImpl(read_options, key, get_impl_options); } -Status DBImpl::GetImpl(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, bool* value_found, - ReadCallback* callback, bool* is_blob_index) { - assert(pinnable_val != nullptr); +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = reinterpret_cast(column_family); + auto cfh = + reinterpret_cast(get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1461,7 +1464,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // tracing is enabled. InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } @@ -1473,9 +1476,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, SequenceNumber snapshot; if (read_options.snapshot != nullptr) { - if (callback) { + if (get_impl_options.callback) { // Already calculated based on read_options.snapshot - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } else { snapshot = reinterpret_cast(read_options.snapshot)->number_; @@ -1489,12 +1492,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); - if (callback) { + if (get_impl_options.callback) { // The unprep_seqs are not published for write unprepared, so it could be // that max_visible_seq is larger. Seek to the std::max of the two. // However, we still want our callback to contain the actual snapshot so // that it can do the correct visibility filtering. 
- callback->Refresh(snapshot); + get_impl_options.callback->Refresh(snapshot); // Internally, WriteUnpreparedTxnReadCallback::Refresh would set // max_visible_seq = max(max_visible_seq, snapshot) @@ -1505,7 +1508,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // be needed. // // assert(callback->max_visible_seq() >= snapshot); - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); @@ -1526,19 +1529,39 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } } if (!done && !s.ok() && !s.IsMergeInProgress()) { ReturnAndCleanupSuperVersion(cfd, sv); @@ -1547,9 +1570,14 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &max_covering_tombstone_seq, value_found, nullptr, nullptr, - callback, is_blob_index); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); RecordTick(stats_, MEMTABLE_MISS); } @@ -1561,7 +1589,25 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = 0; if (s.ok()) { - size = pinnable_val->size(); + if (get_impl_options.get_value) { + size = get_impl_options.value->size(); + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } RecordTick(stats_, BYTES_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); } @@ -2222,7 +2268,11 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; - auto s = GetImpl(roptions, column_family, key, &pinnable_val, value_found); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = value_found; + auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); // If block_cache is enabled and the index block of the table didn't diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index fe3a2f6f20f..f1dbc5d0286 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -159,6 +159,21 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } + using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -395,12 +410,32 @@ class DBImpl : public DB { // ---- End of implementations of the DB interface ---- + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, 
ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions get_impl_options); ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, ColumnFamilyData* cfd, diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index e3b2f576523..3c5fd4fcd7f 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -318,8 +318,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // We may ignore the dbname when generating the file names. for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(file.metadata->fd.GetNumber()), - file.path); + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); } diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc new file mode 100644 index 00000000000..e6280ad8c79 --- /dev/null +++ b/db/db_merge_operand_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace rocksdb { + +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {} +}; + +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. 
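+  // Note: this limit only applies to reads that merge operands (e.g. Get());
+  // GetMergeOperands() returns the raw operands in insertion order instead.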
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector<PinnableSlice> values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // k0 value in memtable + Put("k0", "PutARock"); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "PutARock"); + + // k0.1 value in SST + Put("k0.1", "RockInSST"); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "RockInSST"); + + // All k1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + Put("k1", "x"); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + Delete("k1.1"); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + Put("k2.1", "l"); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + Delete("k2.2"); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. + ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + Put("k3.1", "bc"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. + ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + Delete("k3.2"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + Put("k5", "remember"); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + dbfull()->TEST_SwitchMemtable(); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 31bd2e491b1..8358ddb56c2 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -46,9 +46,11 @@ class DBMergeOperatorTest : public DBTestBase { ReadOptions read_opt; read_opt.snapshot = snapshot; PinnableSlice value; - Status s = - dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value, - nullptr /*value_found*/, &read_callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); if (!s.ok()) { return s.ToString(); } diff --git a/db/db_test.cc b/db/db_test.cc index f53afa17d9d..5c96bec36c5 100644 --- a/db/db_test.cc 
+++ b/db/db_test.cc @@ -2540,6 +2540,15 @@ class ModelDB : public DB { return Status::NotSupported(key); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + using DB::MultiGet; std::vector<Status> MultiGet( const ReadOptions& /*options*/, diff --git a/db/db_test2.cc b/db/db_test2.cc index 3664b3a249f..26604c53ad8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2797,8 +2797,12 @@ TEST_F(DBTest2, ReadCallbackTest) { ReadOptions roptions; TestReadCallback callback(seq); bool dont_care = true; - Status s = dbfull()->GetImpl(roptions, dbfull()->DefaultColumnFamily(), key, - &pinnable_val, &dont_care, &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = dbfull()->DefaultColumnFamily(); + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &dont_care; + get_impl_options.callback = &callback; + Status s = dbfull()->GetImpl(roptions, key, get_impl_options); ASSERT_TRUE(s.ok()); // Assuming that after each Put the DB increased seq by one, the value and // seq number must be equal since we also inc value by 1 after each Put. diff --git a/db/memtable.cc b/db/memtable.cc index fdd1a577ade..62c7339b5d0 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -601,6 +601,7 @@ struct Saver { Logger* logger; Statistics* statistics; bool inplace_update_support; + bool do_merge; Env* env_; ReadCallback* callback_; bool* is_blob_index; @@ -627,7 +628,7 @@ static bool SaveValue(void* arg, const char* entry) { // klength varint32 // userkey char[klength-8] // tag uint64 // vlength varint32 // value char[vlength] // Check that it belongs to same user key.
We do not check the // sequence number since the Seek() call above should have skipped @@ -677,12 +678,24 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - if (s->value != nullptr) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + if (s->do_merge) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } + } else if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } @@ -726,7 +739,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); - if (merge_operator->ShouldMerge(merge_context->GetOperandsDirectionBackward())) { + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, @@ -750,7 +764,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, - ReadCallback* callback, bool* is_blob_index) { + ReadCallback* callback, bool* is_blob_index, bool do_merge) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -810,8 +824,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.env_ = env_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; table_->Get(key, &saver, SaveValue); - *seq = saver.seq; } diff --git a/db/memtable.h b/db/memtable.h index 6b8c4141f5a..36ba0df79ba 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -175,6 +175,10 @@ class MemTable { const Slice& value, bool allow_concurrent = false, MemTablePostProcessInfo* post_process_info = nullptr); + // Used to get the value associated with key, or to get the merge operands + // associated with key. + // If do_merge = true, the default behavior, which is to get the value for + // key, is executed. Expected behavior is described right below. // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. @@ -188,20 +192,23 @@ class MemTable { // returned). Otherwise, *seq will be set to kMaxSequenceNumber. // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply + // stored in merge_context.operands_list and never actually merged to get a + // final value. The raw Merge Operands are eventually returned to the user. bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + bool* is_blob_index = nullptr, bool do_merge = true); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr) { + bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + read_opts, callback, is_blob_index, do_merge); } // Attempts to update the new_value inplace, else does normal Add diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 0f796eb9a73..d06a82df8ef 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -109,6 +109,20 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, is_blob_index); } +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get(key, nullptr, s, merge_context, + max_covering_tombstone_seq, read_opts, nullptr, + nullptr, false); + if (done) { + return true; + } + } + return false; +} + bool MemTableListVersion::GetFromHistory( const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, diff --git a/db/memtable_list.h b/db/memtable_list.h index a72077ff3d5..2bd225b8390 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -71,6 +71,13 @@ class MemTableListVersion { read_opts, callback, is_blob_index); } + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + // Similar to Get(), but searches the Memtable history of memtables that // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain diff --git a/db/version_set.cc b/db/version_set.cc index 3a1f47790c5..af0168f7660 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1651,7 +1651,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, - bool* is_blob) { + bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -1671,8 +1671,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + do_merge ? value : nullptr, value_found, merge_context, do_merge, + max_covering_tombstone_seq, this->env_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, tracing_get_id); // Pin blocks that we read to hold merge operands @@ -1737,7 +1738,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } - PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, + fp.GetHitFileLevel()); return; case GetContext::kDeleted: // Use empty error message for speed @@ -1755,11 +1757,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } f = fp.GetNextFile(); } - if (db_statistics_ != nullptr) { get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { + if (!do_merge) { + *status = Status::OK(); + return; + } if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -1806,7 +1811,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), + iter->value, nullptr, &(iter->merge_context), true, &iter->max_covering_tombstone_seq, this->env_, &iter->seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, tracing_mget_id); diff --git a/db/version_set.h b/db/version_set.h index 391bb902c4b..25598630e2a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -63,7 +63,6 @@ class VersionSet; class WriteBufferManager; class MergeContext; class ColumnFamilySet; -class TableCache; class MergeIteratorBuilder; // Return the smallest index i such that file_level.files[i]->largest >= key. @@ -561,28 +560,33 @@ class Version { const Slice& largest_user_key, int level, bool* overlap); - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later. + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. // - // If the ReadOptions.read_tier is set to do a read-only fetch, then - // *value_found will be set to false if it cannot be determined whether - // this value exists without doing IO. + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. // - // If the key is Deleted, *status will be set to NotFound and + // If the key is Deleted, *status will be set to NotFound and // *key_exists will be set to true. - // If no key was found, *status will be set to NotFound and + // If no key was found, *status will be set to NotFound and // *key_exists will be set to false. - // If seq is non-null, *seq will be set to the sequence number found - // for the key if a key was found. - // + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. 
+ // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, - bool* is_blob = nullptr); + bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, ReadCallback* callback = nullptr, bool* is_blob = nullptr); diff --git a/file/filename.cc b/file/filename.cc index 65ec3314995..ba5d84c291f 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -60,8 +60,7 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { static std::string MakeFileName(uint64_t number, const char* suffix) { char buf[100]; snprintf(buf, sizeof(buf), "%06llu.%s", - static_cast<unsigned long long>(number), - suffix); + static_cast<unsigned long long>(number), suffix); return buf; } diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 1d90dc50b4b..36d6fea92bb 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -116,6 +116,10 @@ struct IngestExternalFileArg { IngestExternalFileOptions options; }; +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + // A collection of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. @@ -403,6 +407,22 @@ class DB { return Get(options, DefaultColumnFamily(), key, value); } + // Returns all the merge operands corresponding to the key. If the + // number of merge operands in DB is greater than + // merge_operands_options.expected_max_number_of_operands + // no merge operands are returned and status is Incomplete. Merge operands + // returned are in the order of insertion. + // merge_operands - Points to an array of at least + // merge_operands_options.expected_max_number_of_operands entries; + // the caller is responsible for allocating it. If the status + // returned is Incomplete then number_of_operands will contain + // the total number of merge operands found in DB for key. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often "").
Otherwise, diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index ac97ce442af..e4360126dbd 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -76,6 +76,7 @@ class Status { kMemoryLimit = 7, kSpaceLimit = 8, kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, kMaxSubCode }; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 67bf4e2fa6b..35fddc804b9 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -88,6 +88,17 @@ class StackableDB : public DB { return db_->Get(options, column_family, key, value); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + using DB::MultiGet; virtual std::vector<Status> MultiGet( const ReadOptions& options, diff --git a/src.mk b/src.mk index 0c6142e41ad..6d1d655c7f0 100644 --- a/src.mk +++ b/src.mk @@ -191,6 +191,7 @@ LIB_SOURCES = \ utilities/memory/memory_util.cc \ utilities/merge_operators/max.cc \ utilities/merge_operators/put.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ @@ -291,6 +292,7 @@ MAIN_SOURCES = \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 484617d7e14..ae23f6ef2d3 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -631,7 +631,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -656,7 +656,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -681,7 +681,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 120, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -706,7 +706,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 5, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); +
nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index dd1557db147..8043d36ab8e 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -122,7 +122,7 @@ class CuckooReaderTest : public testing::Test { PinnableSlice value; GetContext get_context(ucomp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(user_keys[i]), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); ASSERT_STREQ(values[i].c_str(), value.data()); @@ -336,7 +336,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { AppendInternalKey(¬_found_key, ikey); PinnableSlice value; GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, - Slice(not_found_key), &value, nullptr, nullptr, + Slice(not_found_key), &value, nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); @@ -351,7 +351,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context2(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key2), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); ASSERT_TRUE(value.empty()); @@ -367,7 +367,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context3(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); ASSERT_TRUE(value.empty()); @@ -443,7 +443,7 @@ void WriteFile(const std::vector& keys, // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { value.Reset(); value.clear(); @@ -491,7 +491,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { diff --git a/table/get_context.cc b/table/get_context.cc index f0c7928bf42..cdb5798f782 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -42,9 +42,9 @@ GetContext::GetContext( const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, - bool* is_blob_index, uint64_t tracing_get_id) + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), 
merge_operator_(merge_operator), logger_(logger), @@ -60,6 +60,7 @@ GetContext::GetContext( replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), + do_merge_(do_merge), is_blob_index_(is_blob_index), tracing_get_id_(tracing_get_id) { if (seq_) { @@ -215,29 +216,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } if (kNotFound == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (LIKELY(value_pinner != nullptr)) { - // If the backing resources for the value are provided, pin them - pinnable_val_->PinSlice(value, value_pinner); - } else { - TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); - // Otherwise copy the value - pinnable_val_->PinSelf(value); + // Otherwise copy the value + pinnable_val_->PinSelf(value); + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } if (is_blob_index_ != nullptr) { @@ -256,14 +272,18 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else if (kMerge == state_) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } } return false; @@ -272,24 +292,23 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); state_ = kMerge; // value_pinner is not set from plain_table_reader.cc for example. 
- if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && - value_pinner != nullptr) { - value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); - merge_context_->PushOperand(value, true /*value_pinned*/); - } else { - merge_context_->PushOperand(value, false); - } - if (merge_operator_ != nullptr && - merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + push_operand(value, value_pinner); + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + // If do_merge_ is true, this function is called as part of the DB Get + // API, hence the merge operands should be merged. + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -306,6 +325,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } +} + void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner) { #ifndef ROCKSDB_LITE diff --git a/table/get_context.h b/table/get_context.h index 7110ceae806..97d73ec0b3a 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -66,6 +66,9 @@ class GetContext { GetContextStats get_context_stats_; // Constructor + // @param value Holds the value corresponding to user_key. If it's nullptr + // then return all merge operands corresponding to user_key + // via merge_context // @param value_found If non-nullptr, set to false if key may be present // but we can't be certain because we cannot do IO // @param max_covering_tombstone_seq Pointer to highest sequence number of @@ -78,10 +81,14 @@ class GetContext { // for visibility of a key // @param is_blob_index If non-nullptr, will be used to indicate if a found // key is of type blob index + // @param do_merge True if the value associated with user_key has to be + // returned and false if all the merge operands associated with user_key + // have to be returned. If do_merge=false then all the merge operands are + // stored in merge_context and they are never merged. The value pointer is + // untouched.
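+ // A minimal illustration (hypothetical key and operands, for exposition
+ // only): after Merge(key, "v1") and Merge(key, "v2"), a lookup with
+ // do_merge=true stores the merged result in *value, while a lookup with
+ // do_merge=false leaves *value untouched and merge_context accumulates
+ // the raw operands {"v1", "v2"}.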
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, + MergeContext* merge_context, bool do_merge, SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, @@ -140,6 +147,8 @@ class GetContext { uint64_t get_tracing_get_id() const { return tracing_get_id_; } + void push_operand(const Slice& value, Cleanable* value_pinner); + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -162,6 +171,10 @@ class GetContext { PinnedIteratorsManager* pinned_iters_mgr_; ReadCallback* callback_; bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; bool* is_blob_index_; // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index cec62df5949..45d760f0ef8 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -175,7 +175,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &max_covering_tombstone_seq, env); + true, &max_covering_tombstone_seq, env); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); diff --git a/table/table_test.cc b/table/table_test.cc index 6cd26bc732a..749048b78c2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2323,8 +2323,8 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, /*get_id=*/i); + nullptr, true, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, /*tracing_get_id=*/i); get_perf_context()->Reset(); ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -2579,7 +2579,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), nullptr, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. 
reader->Get(ReadOptions(), "non-exist-key", &get_context, moptions.prefix_extractor.get()); @@ -2750,7 +2750,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); @@ -2836,7 +2836,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); get_perf_context()->Reset(); ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -2862,7 +2862,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); get_perf_context()->Reset(); ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -4230,7 +4230,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::string user_key = ExtractUserKey(kv.first).ToString(); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, kv.first, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -4256,7 +4256,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, encoded_key, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index f6a9d945897..001dd4d2fb0 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -71,6 +71,7 @@ #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" #include "utilities/persistent_cache/block_cache_tier.h" #ifdef OS_WIN @@ -120,7 +121,8 @@ DEFINE_string( "fillseekseq," "randomtransaction," "randomreplacekeys," - "timeseries", + "timeseries," + "getmergeoperands", "Comma-separated list of operations to run in the specified" " order. Available benchmarks:\n" @@ -190,7 +192,13 @@ DEFINE_string( "\tlevelstats -- Print the number of files and bytes per level\n" "\tsstables -- Print sstable info\n" "\theapprofile -- Dump a heap profile (if supported by this port)\n" - "\treplay -- replay the trace file specified with trace_file\n"); + "\treplay -- replay the trace file specified with trace_file\n" + "\tgetmergeoperands -- Insert lots of merge records which are a list of " + "sorted ints for a key and then compare performance of lookup for another " + "key " + "by doing a Get followed by binary searching in the large sorted list vs " + "doing a GetMergeOperands and binary searching in the operands which are " + "sorted sub-lists.
The MergeOperator used is sortlist.h\n"); DEFINE_int64(num, 1000000, "Number of key/values to place in database"); @@ -2880,6 +2888,8 @@ class Benchmark { exit(1); } method = &Benchmark::Replay; + } else if (name == "getmergeoperands") { + method = &Benchmark::GetMergeOperands; } else if (!name.empty()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.c_str()); exit(1); @@ -5921,6 +5931,97 @@ class Benchmark { } } + bool binary_search(std::vector<int>& data, int start, int end, int key) { + if (data.empty()) return false; + if (start > end) return false; + int mid = start + (end - start) / 2; + if (mid > static_cast<int>(data.size()) - 1) return false; + if (data[mid] == key) { + return true; + } else if (data[mid] > key) { + return binary_search(data, start, mid - 1, key); + } else { + return binary_search(data, mid + 1, end, key); + } + } + + // Does a bunch of merge operations for a key (key1) where each merge operand + // is a sorted list. Next, a performance comparison is done between doing a + // Get for key1 followed by searching for another key (key2) in the large + // sorted list vs calling GetMergeOperands for key1 and then searching for + // key2 in all the sorted sub-lists. The latter case is expected to be a lot + // faster. + void GetMergeOperands(ThreadState* thread) { + DB* db = SelectDB(thread); + const int kTotalValues = 100000; + const int kListSize = 100; + std::string key = "my_key"; + std::string value; + + for (int i = 1; i < kTotalValues; i++) { + if (i % kListSize == 0) { + // Remove trailing ',' + value.pop_back(); + db->Merge(WriteOptions(), key, value); + value.clear(); + } else { + value.append(std::to_string(i)).append(","); + } + } + + SortList s; + std::vector<int> data; + // This value can be experimented with and it will demonstrate the + // perf difference between doing a Get and searching for lookup_key in the + // resultant large sorted list vs doing GetMergeOperands and searching + // for lookup_key within the resultant sorted sub-lists. + int lookup_key = 1; + + // Get API call + std::cout << "--- Get API call --- \n"; + PinnableSlice p_slice; + uint64_t st = FLAGS_env->NowNanos(); + db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice); + s.MakeVector(data, p_slice); + bool found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + std::cout << "Found key? " << std::to_string(found) << "\n"; + uint64_t sp = FLAGS_env->NowNanos(); + std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n"; + std::string* dat_ = p_slice.GetSelf(); + std::cout << "Sample data from Get API call: " << dat_->substr(0, 10) + << "\n"; + data.clear(); + + // GetMergeOperands API call + std::cout << "--- GetMergeOperands API --- \n"; + std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1); + st = FLAGS_env->NowNanos(); + int number_of_operands = 0; + GetMergeOperandsOptions get_merge_operands_options; + get_merge_operands_options.expected_max_number_of_operands = + (kTotalValues / 100) + 1; + db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key, + a_slice.data(), &get_merge_operands_options, + &number_of_operands); + for (PinnableSlice& psl : a_slice) { + s.MakeVector(data, psl); + found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + data.clear(); + if (found) break; + } + std::cout << "Found key?
" << std::to_string(found) << "\n"; + sp = FLAGS_env->NowNanos(); + std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0 + << " seconds \n"; + int to_print = 0; + std::cout << "Sample data from GetMergeOperands API call: "; + for (PinnableSlice& psl : a_slice) { + std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n"; + if (to_print++ > 2) break; + } + } + #ifndef ROCKSDB_LITE // This benchmark stress tests Transactions. For a given --duration (or // total number of --writes, a Transaction will perform a read-modify-write diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index caa9b098804..86501280d22 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1146,9 +1146,11 @@ Status BlobDBImpl::GetImpl(const ReadOptions& read_options, PinnableSlice index_entry; Status s; bool is_blob_index = false; - s = db_impl_->GetImpl(ro, column_family, key, &index_entry, - nullptr /*value_found*/, nullptr /*read_callback*/, - &is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &index_entry; + get_impl_options.is_blob_index = &is_blob_index; + s = db_impl_->GetImpl(ro, key, get_impl_options); TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1"); TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2"); if (expiration != nullptr) { @@ -1535,9 +1537,12 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, SequenceNumber latest_seq = GetLatestSequenceNumber(); bool is_blob_index = false; PinnableSlice index_entry; - Status get_status = db_impl_->GetImpl( - ReadOptions(), cfh, record.key, &index_entry, nullptr /*value_found*/, - nullptr /*read_callback*/, &is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh; + get_impl_options.value = &index_entry; + get_impl_options.is_blob_index = &is_blob_index; + Status get_status = + db_impl_->GetImpl(ReadOptions(), record.key, get_impl_options); TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB"); if (!get_status.ok() && !get_status.IsNotFound()) { // error diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h index 4c720b822fe..6e3464bdfb4 100644 --- a/utilities/merge_operators.h +++ b/utilities/merge_operators.h @@ -23,6 +23,7 @@ class MergeOperators { static std::shared_ptr CreateStringAppendTESTOperator(); static std::shared_ptr CreateMaxOperator(); static std::shared_ptr CreateBytesXOROperator(); + static std::shared_ptr CreateSortOperator(); // Will return a different merge operator depending on the string. // TODO: Hook the "name" up to the actual Name() of the MergeOperators? @@ -42,6 +43,8 @@ class MergeOperators { return CreateMaxOperator(); } else if (name == "bytesxor") { return CreateBytesXOROperator(); + } else if (name == "sortlist") { + return CreateSortOperator(); } else { // Empty or unknown, just return nullptr return nullptr; diff --git a/utilities/merge_operators/sortlist.cc b/utilities/merge_operators/sortlist.cc new file mode 100644 index 00000000000..5dbf051157e --- /dev/null +++ b/utilities/merge_operators/sortlist.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" + +using rocksdb::Logger; +using rocksdb::MergeOperator; +using rocksdb::Slice; + +namespace rocksdb { + +bool SortList::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + std::vector left; + for (Slice slice : merge_in.operand_list) { + std::vector right; + MakeVector(right, slice); + left = Merge(left, right); + } + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + merge_out->new_value.append(std::to_string(left[i])).append(","); + } + merge_out->new_value.append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const { + std::vector left; + std::vector right; + MakeVector(left, left_operand); + MakeVector(right, right_operand); + left = Merge(left, right); + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + new_value->append(std::to_string(left[i])).append(","); + } + new_value->append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + (void)operand_list; + (void)new_value; + return true; +} + +const char* SortList::Name() const { return "MergeSortOperator"; } + +void SortList::MakeVector(std::vector& operand, Slice slice) const { + do { + const char* begin = slice.data_; + while (*slice.data_ != ',' && *slice.data_) slice.data_++; + operand.push_back(std::stoi(std::string(begin, slice.data_))); + } while (0 != *slice.data_++); +} + +std::vector SortList::Merge(std::vector& left, + std::vector& right) const { + // Fill the resultant vector with sorted results from both vectors + std::vector result; + unsigned left_it = 0, right_it = 0; + + while (left_it < left.size() && right_it < right.size()) { + // If the left value is smaller than the right it goes next + // into the resultant vector + if (left[left_it] < right[right_it]) { + result.push_back(left[left_it]); + left_it++; + } else { + result.push_back(right[right_it]); + right_it++; + } + } + + // Push the remaining data from both vectors onto the resultant + while (left_it < left.size()) { + result.push_back(left[left_it]); + left_it++; + } + + while (right_it < right.size()) { + result.push_back(right[right_it]); + right_it++; + } + + return result; +} + +std::shared_ptr MergeOperators::CreateSortOperator() { + return std::make_shared(); +} +} // namespace rocksdb diff --git a/utilities/merge_operators/sortlist.h b/utilities/merge_operators/sortlist.h new file mode 100644 index 00000000000..02c93edf5e9 --- /dev/null +++ b/utilities/merge_operators/sortlist.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// A MergeOperator for RocksDB that implements Merge Sort. +// It is built using the MergeOperator interface. The operator works by taking +// an input which contains one or more merge operands where each operand is a +// list of sorted ints and merges them to form a large sorted list. 
+#pragma once + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class SortList : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + + bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, Logger* logger) const override; + + const char* Name() const override; + + void MakeVector(std::vector<int>& operand, Slice slice) const; + + private: + std::vector<int> Merge(std::vector<int>& left, std::vector<int>& right) const; +}; + +} // namespace rocksdb diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 188f61120be..8dfc0d1d4ac 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -290,8 +290,12 @@ Status WritePreparedTxn::RollbackInternal() { PinnableSlice pinnable_val; bool not_used; auto cf_handle = handles_[cf]; - s = db_->GetImpl(roptions_, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_->GetImpl(roptions_, key, get_impl_options); assert(s.ok() || s.IsNotFound()); if (s.ok()) { s = rollback_batch_->Put(cf_handle, key, pinnable_val); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index e6d71020685..7441cb3c093 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -231,8 +231,12 @@ Status WritePreparedTxnDB::Get(const ReadOptions& options, WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted, backed_by_snapshot); bool* dont_care = nullptr; - auto res = db_impl_->GetImpl(options, column_family, key, value, dont_care, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + get_impl_options.value_found = dont_care; + get_impl_options.callback = &callback; + auto res = db_impl_->GetImpl(options, key, get_impl_options); if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) { return res; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index a1862d32d44..321110ea1b6 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -567,8 +567,12 @@ Status WriteUnpreparedTxn::RollbackInternal() { const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; - s = db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_impl_->GetImpl(roptions, key, get_impl_options); if (s.ok()) { s = rollback_batch.Put(cf_handle, key, pinnable_val); @@ -721,8 +725,12 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; - s =
db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_impl_->GetImpl(roptions, key, get_impl_options); if (s.ok()) { s = write_batch_.Put(cf_handle, key, pinnable_val); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index defaf9fce6e..3a8eff5ec5e 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -86,8 +86,12 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( PinnableSlice pinnable_val; bool not_used; auto cf_handle = handles_[cf]; - s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_->GetImpl(roptions, key, get_impl_options); assert(s.ok() || s.IsNotFound()); if (s.ok()) { s = rollback_batch_->Put(cf_handle, key, pinnable_val); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 3ffa2e0c62a..272a2ab4862 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -891,9 +891,12 @@ Status WriteBatchWithIndex::GetFromBatchAndDB( if (!callback) { s = db->Get(read_options, column_family, key, pinnable_val); } else { + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = pinnable_val; + get_impl_options.callback = callback; s = static_cast_with_check<DBImpl, DB>(db->GetRootDB()) - ->GetImpl(read_options, column_family, key, pinnable_val, nullptr, - callback); + ->GetImpl(read_options, key, get_impl_options); } if (s.ok() || s.IsNotFound()) { // DB Get Succeeded From 6e78fe3c8d35fa1c0836af4501e0f272bc363bab Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 6 Aug 2019 18:47:39 -0700 Subject: [PATCH 284/572] Pysim more algorithms (#5644) Summary: This PR adds four more eviction policies. - OPT [1] - Hyperbolic caching [2] - ARC [3] - GreedyDualSize [4] [1] L. A. Belady. 1966. A Study of Replacement Algorithms for a Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101. DOI=http://dx.doi.org/10.1147/sj.52.0078 [2] Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017. Hyperbolic caching: flexible caching for web applications. In Proceedings of the 2017 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511. [3] Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA, USA, 115-130. [4] N. Young. The k-server dual and loose competitiveness for paging. Algorithmica, June 1994, vol. 11,(no.6):525-41. Rewritten version of ''On-line caching as cache size varies'', in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5644 Differential Revision: D16548817 Pulled By: HaoyuHuang fbshipit-source-id: 838f76db9179f07911abaab46c97e1c929cfcd63 --- .../block_cache_analyzer/block_cache_pysim.py | 1636 ++++++++++++++--- .../block_cache_analyzer/block_cache_pysim.sh | 86 +- .../block_cache_pysim_test.py | 478 ++++- .../block_cache_trace_analyzer.cc | 162 +- .../block_cache_trace_analyzer.h | 14 +- .../block_cache_trace_analyzer_test.cc | 31 +- trace_replay/block_cache_tracer.cc | 39 +- trace_replay/block_cache_tracer.h | 9 + .../simulator_cache/cache_simulator_test.cc | 26 +- 9 files changed, 2111 insertions(+), 370 deletions(-) diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py index 63e367be5a7..67307df5329 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.py +++ b/tools/block_cache_analyzer/block_cache_pysim.py @@ -2,15 +2,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import gc +import heapq import random import sys import time +from collections import OrderedDict from os import path import numpy as np -kSampleSize = 16 # The sample size used when performing eviction. +kSampleSize = 64 # The sample size used when performing eviction. kMicrosInSecond = 1000000 kSecondsInMinute = 60 kSecondsInHour = 3600 @@ -39,11 +41,19 @@ def __init__( key_id, kv_size, is_hit, + referenced_key_exist_in_block, + num_keys_in_block, + table_id, + seq_number, + block_key_size, + key_size, + block_offset_in_file, + next_access_seq_no, ): self.access_time = access_time self.block_id = block_id self.block_type = block_type - self.block_size = block_size + self.block_size = block_size + block_key_size self.cf_id = cf_id self.cf_name = cf_name self.level = level @@ -60,22 +70,46 @@ def __init__( self.is_hit = True else: self.is_hit = False + if referenced_key_exist_in_block == 1: + self.referenced_key_exist_in_block = True + else: + self.referenced_key_exist_in_block = False + self.num_keys_in_block = num_keys_in_block + self.table_id = table_id + self.seq_number = seq_number + self.block_key_size = block_key_size + self.key_size = key_size + self.block_offset_in_file = block_offset_in_file + self.next_access_seq_no = next_access_seq_no class CacheEntry: """A cache entry stored in the cache.""" - def __init__(self, value_size, cf_id, level, block_type, access_number): + def __init__( + self, + value_size, + cf_id, + level, + block_type, + table_id, + access_number, + time_s, + num_hits=0, + ): self.value_size = value_size self.last_access_number = access_number - self.num_hits = 0 + self.num_hits = num_hits self.cf_id = 0 self.level = level self.block_type = block_type + self.last_access_time = time_s + self.insertion_time = time_s + self.table_id = table_id def __repr__(self): """Debug string.""" - return "s={},last={},hits={},cf={},l={},bt={}".format( + return "(s={},last={},hits={},cf={},l={},bt={})\n".format( self.value_size, self.last_access_number, self.num_hits, @@ -84,6 +118,22 @@ def __repr__(self): self.block_type, ) + def cost_class(self, cost_class_label): + if cost_class_label == "table_bt": + return "{}-{}".format(self.table_id, self.block_type) + elif cost_class_label == "table": + return "{}".format(self.table_id) + elif cost_class_label == "bt": + return "{}".format(self.block_type) + elif cost_class_label == "cf": + return "{}".format(self.cf_id) + elif cost_class_label == "cf_bt": + return "{}-{}".format(self.cf_id, self.block_type) + elif 
cost_class_label == "table_level_bt": + return "{}-{}-{}".format(self.table_id, self.level, self.block_type) + assert False, "Unknown cost class label {}".format(cost_class_label) + return None + class HashEntry: """A hash entry stored in a hash table.""" @@ -106,30 +156,55 @@ class HashTable: """ def __init__(self): - self.table = [None] * 32 + self.initial_size = 32 + self.table = [None] * self.initial_size self.elements = 0 def random_sample(self, sample_size): """Randomly sample 'sample_size' hash entries from the table.""" samples = [] - index = random.randint(0, len(self.table)) - pos = (index + 1) % len(self.table) - searches = 0 + index = random.randint(0, len(self.table) - 1) + pos = index # Starting from index, adding hash entries to the sample list until # sample_size is met or we ran out of entries. - while pos != index and len(samples) < sample_size: + while True: if self.table[pos] is not None: for i in range(len(self.table[pos])): if self.table[pos][i] is None: continue samples.append(self.table[pos][i]) - if len(samples) > sample_size: + if len(samples) == sample_size: break pos += 1 pos = pos % len(self.table) - searches += 1 + if pos == index or len(samples) == sample_size: + break + assert len(samples) <= sample_size return samples + def __repr__(self): + all_entries = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_entries.append(self.table[i][j]) + return "{}".format(all_entries) + + def values(self): + all_values = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_values.append(self.table[i][j].value) + return all_values + + def __len__(self): + return self.elements + def insert(self, key, hash, value): """ Insert a hash entry in the table. Replace the old entry if it already @@ -140,19 +215,21 @@ def insert(self, key, hash, value): index = hash % len(self.table) if self.table[index] is None: self.table[index] = [] + # Search for the entry first. for i in range(len(self.table[index])): - if self.table[index][i] is not None: - if ( - self.table[index][i].hash == hash - and self.table[index][i].key == key - ): - # The entry already exists in the table. - self.table[index][i] = HashEntry(key, hash, value) - return + if self.table[index][i] is None: continue - self.table[index][i] = HashEntry(key, hash, value) - inserted = True - break + if self.table[index][i].hash == hash and self.table[index][i].key == key: + # The entry already exists in the table. + self.table[index][i] = HashEntry(key, hash, value) + return + + # Find an empty slot. 
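+        # delete() leaves None holes in a bucket; reuse one before appending.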
+ for i in range(len(self.table[index])): + if self.table[index][i] is None: + self.table[index][i] = HashEntry(key, hash, value) + inserted = True + break if not inserted: self.table[index].append(HashEntry(key, hash, value)) self.elements += 1 @@ -160,7 +237,7 @@ def insert(self, key, hash, value): def resize(self, new_size): if new_size == len(self.table): return - if new_size == 0: + if new_size < self.initial_size: return if self.elements < 100: return @@ -184,29 +261,31 @@ def resize(self, new_size): gc.collect() def grow(self): - if self.elements < len(self.table): + if self.elements < 4 * len(self.table): return - new_size = int(len(self.table) * 1.2) + new_size = int(len(self.table) * 1.5) self.resize(new_size) def delete(self, key, hash): index = hash % len(self.table) - entries = self.table[index] deleted = False - if entries is None: + deleted_entry = None + if self.table[index] is None: return - for i in range(len(entries)): + for i in range(len(self.table[index])): if ( - entries[i] is not None - and entries[i].hash == hash - and entries[i].key == key + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key ): - entries[i] = None + deleted_entry = self.table[index][i] + self.table[index][i] = None self.elements -= 1 deleted = True break if deleted: self.shrink() + return deleted_entry def shrink(self): if self.elements * 2 >= len(self.table): @@ -216,12 +295,15 @@ def shrink(self): def lookup(self, key, hash): index = hash % len(self.table) - entries = self.table[index] - if entries is None: + if self.table[index] is None: return None - for entry in entries: - if entry is not None and entry.hash == hash and entry.key == key: - return entry.value + for i in range(len(self.table[index])): + if ( + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key + ): + return self.table[index][i].value return None @@ -231,9 +313,10 @@ def __init__(self, time_unit): self.num_accesses = 0 self.time_unit = time_unit self.time_misses = {} + self.time_miss_bytes = {} self.time_accesses = {} - def update_metrics(self, access_time, is_hit): + def update_metrics(self, access_time, is_hit, miss_bytes): access_time /= kMicrosInSecond * self.time_unit self.num_accesses += 1 if access_time not in self.time_accesses: @@ -243,20 +326,41 @@ def update_metrics(self, access_time, is_hit): self.num_misses += 1 if access_time not in self.time_misses: self.time_misses[access_time] = 0 + self.time_miss_bytes[access_time] = 0 self.time_misses[access_time] += 1 + self.time_miss_bytes[access_time] += miss_bytes def reset_counter(self): self.num_misses = 0 self.num_accesses = 0 + self.time_miss_bytes.clear() + self.time_misses.clear() + self.time_accesses.clear() + + def compute_miss_bytes(self): + miss_bytes = [] + for at in self.time_miss_bytes: + miss_bytes.append(self.time_miss_bytes[at]) + miss_bytes = sorted(miss_bytes) + avg_miss_bytes = 0 + p95_miss_bytes = 0 + for i in range(len(miss_bytes)): + avg_miss_bytes += float(miss_bytes[i]) / float(len(miss_bytes)) + + p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1) + p95_miss_bytes = miss_bytes[p95_index] + return avg_miss_bytes, p95_miss_bytes def miss_ratio(self): return float(self.num_misses) * 100.0 / float(self.num_accesses) - def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_miss_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= 
kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -264,8 +368,8 @@ def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-miss-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: row = "{}".format(cache_type) @@ -273,11 +377,13 @@ def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): row += ",{}".format(self.time_misses.get(trace_time, 0)) file.write(row + "\n") - def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_miss_ratio_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -285,8 +391,8 @@ def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, e for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: row = "{}".format(cache_type) @@ -322,11 +428,13 @@ def update_metrics(self, access_time, selected_policy): self.time_selected_polices[access_time][policy_name] = 0 self.time_selected_polices[access_time][policy_name] += 1 - def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_policy_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -334,8 +442,8 @@ def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-policy-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) 
with open(file_path, "w+") as file: for policy in self.policy_names: @@ -350,12 +458,12 @@ def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): file.write(row + "\n") def write_policy_ratio_timeline( - self, cache_type, cache_size, file_path, start, end + self, cache_type, cache_size, target_cf_name, file_path, start, end ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -363,8 +471,8 @@ def write_policy_ratio_timeline( for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: for policy in self.policy_names: @@ -400,7 +508,7 @@ def evict(self, key, max_size): def delete(self, key): self.evicted_keys.pop(key, None) - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): raise NotImplementedError def policy_name(self): @@ -413,7 +521,7 @@ def generate_reward(self, key): class LRUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted( samples, cmp=lambda e1, e2: e1.value.last_access_number @@ -425,7 +533,7 @@ def policy_name(self): class MRUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted( samples, cmp=lambda e1, e2: e2.value.last_access_number @@ -437,175 +545,478 @@ def policy_name(self): class LFUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted(samples, cmp=lambda e1, e2: e1.value.num_hits - e2.value.num_hits) def policy_name(self): return "lfu" -class MLCache(object): - def __init__(self, cache_size, enable_cache_row_key, policies): +class HyperbolicPolicy(Policy): + """ + An implementation of Hyperbolic caching. + + Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017. + Hyperbolic caching: flexible caching for web applications. In Proceedings + of the 2017 USENIX Conference on Usenix Annual Technical Conference + (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511. 
+ """ + + def compare(self, e1, e2, now): + e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float( + e1.value.value_size + ) + e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float( + e2.value.value_size + ) + if e1_duration == e2_duration: + return e1.value.num_hits - e2.value.num_hits + if e1_duration == 0: + return 1 + if e2_duration == 0: + return 1 + diff = (float(e1.value.num_hits) / (float(e1_duration))) - ( + float(e2.value.num_hits) / float(e2_duration) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + return sorted(samples, cmp=lambda e1, e2: self.compare(e1, e2, now)) + + def policy_name(self): + return "hb" + + +class CostClassPolicy(Policy): + """ + We calculate the hit density of a cost class as + number of hits / total size in cache * average duration in the cache. + + An entry has a higher priority if its class's hit density is higher. + """ + + def compare(self, e1, e2, now, cost_classes, cost_class_label): + e1_class = e1.value.cost_class(cost_class_label) + e2_class = e2.value.cost_class(cost_class_label) + + assert e1_class in cost_classes + assert e2_class in cost_classes + + e1_entry = cost_classes[e1_class] + e2_entry = cost_classes[e2_class] + e1_density = e1_entry.density(now) + e2_density = e2_entry.density(now) + e1_hits = cost_classes[e1_class].hits + e2_hits = cost_classes[e2_class].hits + + if e1_density == e2_density: + return e1_hits - e2_hits + + if e1_entry.num_entries_in_cache == 0: + return -1 + if e2_entry.num_entries_in_cache == 0: + return 1 + + if e1_density == 0: + return 1 + if e2_density == 0: + return -1 + diff = (float(e1_hits) / float(e1_density)) - ( + float(e2_hits) / float(e2_density) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + cost_classes = auxilliary_info[1] + cost_class_label = auxilliary_info[2] + return sorted( + samples, + cmp=lambda e1, e2: self.compare( + e1, e2, now, cost_classes, cost_class_label + ), + ) + + def policy_name(self): + return "cc" + + +class Cache(object): + """ + This is the base class for the implementations of alternative cache + replacement policies. + """ + + def __init__(self, cache_size, enable_cache_row_key): self.cache_size = cache_size self.used_size = 0 + self.per_second_miss_ratio_stats = MissRatioStats(1) self.miss_ratio_stats = MissRatioStats(kSecondsInMinute) - self.policy_stats = PolicyStats(kSecondsInMinute, policies) self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour) - self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) - self.table = HashTable() + # 0: disabled. 1: enabled. Insert both row and the refereneced data block. + # 2: enabled. Insert only the row but NOT the referenced data block. 
self.enable_cache_row_key = enable_cache_row_key self.get_id_row_key_map = {} - self.policies = policies + self.max_seen_get_id = 0 + self.retain_get_id_range = 100000 - def _lookup(self, key, hash): - value = self.table.lookup(key, hash) - if value is not None: - value.last_access_number = self.miss_ratio_stats.num_accesses - value.num_hits += 1 - return True - return False + def block_key(self, trace_record): + return "b{}".format(trace_record.block_id) - def _select_policy(self, trace_record, key): - raise NotImplementedError + def row_key(self, trace_record): + return "g{}-{}".format(trace_record.fd, trace_record.key_id) - def cache_name(self): + def _lookup(self, trace_record, key, hash): + """ + Look up the key in the cache. + Returns true upon a cache hit, false otherwise. + """ raise NotImplementedError - def _evict(self, policy_index, value_size): - # Randomly sample n entries. - samples = self.table.random_sample(kSampleSize) - samples = self.policies[policy_index].prioritize_samples(samples) - for hash_entry in samples: - self.used_size -= hash_entry.value.value_size - self.table.delete(hash_entry.key, hash_entry.hash) - self.policies[policy_index].evict( - key=hash_entry.key, max_size=self.table.elements - ) - if self.used_size + value_size <= self.cache_size: - break + def _evict(self, trace_record, key, hash, value_size): + """ + Evict entries in the cache until there is enough room to insert the new + entry with 'value_size'. + """ + raise NotImplementedError def _insert(self, trace_record, key, hash, value_size): - if value_size > self.cache_size: - return - policy_index = self._select_policy(trace_record, key) - self.policies[policy_index].delete(key) - self.policy_stats.update_metrics(trace_record.access_time, policy_index) - self.per_hour_policy_stats.update_metrics( - trace_record.access_time, policy_index - ) - while self.used_size + value_size > self.cache_size: - self._evict(policy_index, value_size) - self.table.insert( - key, - hash, - CacheEntry( - value_size, - trace_record.cf_id, - trace_record.level, - trace_record.block_type, - self.miss_ratio_stats.num_accesses, - ), - ) - self.used_size += value_size + """ + Insert the new entry into the cache. + """ + raise NotImplementedError - def _access_kv(self, trace_record, key, hash, value_size, no_insert): - if self._lookup(key, hash): - return True - if not no_insert and value_size > 0: - self._insert(trace_record, key, hash, value_size) + def _should_admit(self, trace_record, key, hash, value_size): + """ + A custom admission policy to decide whether we should admit the new + entry upon a cache miss. + Returns true if the new entry should be admitted, false otherwise. + """ + raise NotImplementedError + + def cache_name(self): + """ + The name of the replacement policy. + """ + raise NotImplementedError + + def is_ml_cache(self): return False - def _update_stats(self, access_time, is_hit): - self.miss_ratio_stats.update_metrics(access_time, is_hit) - self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit) + def _update_stats(self, access_time, is_hit, miss_bytes): + self.per_second_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) def access(self, trace_record): + """ + Access a trace record. The simulator calls this function to access a + trace record. 
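+        A Get request (caller == 1) with hybrid row caching enabled is
+        dispatched to _access_row; all other records are treated as plain
+        block accesses.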
+ """ assert self.used_size <= self.cache_size if ( - self.enable_cache_row_key + self.enable_cache_row_key > 0 and trace_record.caller == 1 and trace_record.key_id != 0 and trace_record.get_id != 0 ): # This is a get request. - if trace_record.get_id not in self.get_id_row_key_map: - self.get_id_row_key_map[trace_record.get_id] = {} - self.get_id_row_key_map[trace_record.get_id]["h"] = False - if self.get_id_row_key_map[trace_record.get_id]["h"]: - # We treat future accesses as hits since this get request - # completes. - self._update_stats(trace_record.access_time, is_hit=True) - return - if trace_record.key_id not in self.get_id_row_key_map[trace_record.get_id]: - # First time seen this key. - is_hit = self._access_kv( - trace_record, - key="g{}".format(trace_record.key_id), - hash=trace_record.key_id, - value_size=trace_record.kv_size, - no_insert=False, - ) - inserted = False - if trace_record.kv_size > 0: - inserted = True - self.get_id_row_key_map[trace_record.get_id][ - trace_record.key_id - ] = inserted - self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit - if self.get_id_row_key_map[trace_record.get_id]["h"]: - # We treat future accesses as hits since this get request - # completes. - self._update_stats(trace_record.access_time, is_hit=True) - return - # Access its blocks. + self._access_row(trace_record) + return + is_hit = self._access_kv( + trace_record, + self.block_key(trace_record), + trace_record.block_id, + trace_record.block_size, + trace_record.no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit=is_hit, miss_bytes=trace_record.block_size + ) + + def _access_row(self, trace_record): + row_key = self.row_key(trace_record) + self.max_seen_get_id = max(self.max_seen_get_id, trace_record.get_id) + self.get_id_row_key_map.pop( + self.max_seen_get_id - self.retain_get_id_range, None + ) + if trace_record.get_id not in self.get_id_row_key_map: + self.get_id_row_key_map[trace_record.get_id] = {} + self.get_id_row_key_map[trace_record.get_id]["h"] = False + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + # print("row hit 1") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) + return + if row_key not in self.get_id_row_key_map[trace_record.get_id]: + # First time seen this key. is_hit = self._access_kv( trace_record, - key="b{}".format(trace_record.block_id), - hash=trace_record.block_id, - value_size=trace_record.block_size, - no_insert=trace_record.no_insert, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, ) - self._update_stats(trace_record.access_time, is_hit) - if ( - trace_record.kv_size > 0 - and not self.get_id_row_key_map[trace_record.get_id][ - trace_record.key_id - ] - ): - # Insert the row key-value pair. - self._access_kv( - trace_record, - key="g{}".format(trace_record.key_id), - hash=trace_record.key_id, - value_size=trace_record.kv_size, - no_insert=False, - ) - # Mark as inserted. - self.get_id_row_key_map[trace_record.get_id][trace_record.key_id] = True + inserted = False + if trace_record.kv_size > 0: + inserted = True + self.get_id_row_key_map[trace_record.get_id][row_key] = inserted + self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. 
+ # print("row hit 2") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) return - # Access the block. + # Access its blocks. + no_insert = trace_record.no_insert + if ( + self.enable_cache_row_key == 2 + and trace_record.kv_size > 0 + and trace_record.block_type == 9 + ): + no_insert = True is_hit = self._access_kv( trace_record, - key="b{}".format(trace_record.block_id), + key=self.block_key(trace_record), hash=trace_record.block_id, value_size=trace_record.block_size, - no_insert=trace_record.no_insert, + no_insert=no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit, miss_bytes=trace_record.block_size + ) + if ( + trace_record.kv_size > 0 + and not self.get_id_row_key_map[trace_record.get_id][row_key] + ): + # Insert the row key-value pair. + self._access_kv( + trace_record, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + # Mark as inserted. + self.get_id_row_key_map[trace_record.get_id][row_key] = True + + def _access_kv(self, trace_record, key, hash, value_size, no_insert): + # Sanity checks. + assert self.used_size <= self.cache_size + if self._lookup(trace_record, key, hash): + # A cache hit. + return True + if no_insert or value_size <= 0: + return False + # A cache miss. + if value_size > self.cache_size: + # The block is too large to fit into the cache. + return False + self._evict(trace_record, key, hash, value_size) + if self._should_admit(trace_record, key, hash, value_size): + self._insert(trace_record, key, hash, value_size) + self.used_size += value_size + return False + + +class CostClassEntry: + """ + A cost class maintains aggregated statistics of cached entries in a class. + For example, we may define block type as a class. Then, cached blocks of the + same type will share one cost class entry. 
+ """ + + def __init__(self): + self.hits = 0 + self.num_entries_in_cache = 0 + self.size_in_cache = 0 + self.sum_insertion_times = 0 + self.sum_last_access_time = 0 + + def insert(self, trace_record, key, value_size): + self.size_in_cache += value_size + self.num_entries_in_cache += 1 + self.sum_insertion_times += trace_record.access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def remove(self, insertion_time, last_access_time, key, value_size, num_hits): + self.hits -= num_hits + self.num_entries_in_cache -= 1 + self.sum_insertion_times -= insertion_time / kMicrosInSecond + self.size_in_cache -= value_size + self.sum_last_access_time -= last_access_time / kMicrosInSecond + + def update_on_hit(self, trace_record, last_access_time): + self.hits += 1 + self.sum_last_access_time -= last_access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def avg_lifetime_in_cache(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + return now / kMicrosInSecond - avg_insertion_time + + def avg_last_access_time(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def avg_size(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def density(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + in_cache_duration = now / kMicrosInSecond - avg_insertion_time + return self.size_in_cache * in_cache_duration + + +class MLCache(Cache): + """ + MLCache is the base class for implementations of alternative replacement + policies using reinforcement learning. + """ + + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(MLCache, self).__init__(cache_size, enable_cache_row_key) + self.table = HashTable() + self.policy_stats = PolicyStats(kSecondsInMinute, policies) + self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) + self.policies = policies + self.cost_classes = {} + self.cost_class_label = cost_class_label + + def is_ml_cache(self): + return True + + def _lookup(self, trace_record, key, hash): + value = self.table.lookup(key, hash) + if value is not None: + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].update_on_hit( + trace_record, value.last_access_time + ) + # Update the entry's last access time. + self.table.insert( + key, + hash, + CacheEntry( + value_size=value.value_size, + cf_id=value.cf_id, + level=value.level, + block_type=value.block_type, + table_id=value.table_id, + access_number=self.miss_ratio_stats.num_accesses, + time_s=trace_record.access_time, + num_hits=value.num_hits + 1, + ), + ) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Select a policy, random sample kSampleSize keys from the cache, then + # evict keys in the sample set until we have enough room for the new + # entry. 
+ policy_index = self._select_policy(trace_record, key) + assert policy_index < len(self.policies) and policy_index >= 0 + self.policies[policy_index].delete(key) + self.policy_stats.update_metrics(trace_record.access_time, policy_index) + self.per_hour_policy_stats.update_metrics( + trace_record.access_time, policy_index + ) + while self.used_size + value_size > self.cache_size: + # Randomly sample n entries. + samples = self.table.random_sample(kSampleSize) + samples = self.policies[policy_index].prioritize_samples( + samples, + [trace_record.access_time, self.cost_classes, self.cost_class_label], + ) + for hash_entry in samples: + assert self.table.delete(hash_entry.key, hash_entry.hash) is not None + self.used_size -= hash_entry.value.value_size + self.policies[policy_index].evict( + key=hash_entry.key, max_size=self.table.elements + ) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = hash_entry.value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].remove( + hash_entry.value.insertion_time, + hash_entry.value.last_access_time, + key, + hash_entry.value.value_size, + hash_entry.value.num_hits, + ) + if self.used_size + value_size <= self.cache_size: + break + + def _insert(self, trace_record, key, hash, value_size): + assert self.used_size + value_size <= self.cache_size + entry = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + self.miss_ratio_stats.num_accesses, + trace_record.access_time, ) - self._update_stats(trace_record.access_time, is_hit) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = entry.cost_class(self.cost_class_label) + if cost_class not in self.cost_classes: + self.cost_classes[cost_class] = CostClassEntry() + self.cost_classes[cost_class].insert(trace_record, key, value_size) + self.table.insert(key, hash, entry) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def _select_policy(self, trace_record, key): + raise NotImplementedError class ThompsonSamplingCache(MLCache): """ - An implementation of Thompson Sampling for the Bernoulli Bandit [1]. - [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, + An implementation of Thompson Sampling for the Bernoulli Bandit. + + Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. 
DOI: https://doi.org/10.1561/2200000070 """ - def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b=1): + def __init__( + self, + cache_size, + enable_cache_row_key, + policies, + cost_class_label, + init_a=1, + init_b=1, + ): super(ThompsonSamplingCache, self).__init__( - cache_size, enable_cache_row_key, policies + cache_size, enable_cache_row_key, policies, cost_class_label ) self._as = {} self._bs = {} @@ -614,6 +1025,8 @@ def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b= self._bs = [init_b] * len(self.policies) def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 samples = [ np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies)) ] @@ -626,23 +1039,28 @@ def _select_policy(self, trace_record, key): def cache_name(self): if self.enable_cache_row_key: - return "Hybrid ThompsonSampling (ts_hybrid)" - return "ThompsonSampling (ts)" + return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format( + self.cost_class_label + ) + return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label) class LinUCBCache(MLCache): """ - An implementation of LinUCB with disjoint linear models [2]. - [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. + An implementation of LinUCB with disjoint linear models. + + Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 """ - def __init__(self, cache_size, enable_cache_row_key, policies): - super(LinUCBCache, self).__init__(cache_size, enable_cache_row_key, policies) - self.nfeatures = 4 # Block type, caller, level, cf. + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(LinUCBCache, self).__init__( + cache_size, enable_cache_row_key, policies, cost_class_label + ) + self.nfeatures = 4 # Block type, level, cf. self.th = np.zeros((len(self.policies), self.nfeatures)) self.eps = 0.2 self.b = np.zeros_like(self.th) @@ -655,11 +1073,12 @@ def __init__(self, cache_size, enable_cache_row_key, policies): self.alph = 0.2 def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 x_i = np.zeros(self.nfeatures) # The current context vector x_i[0] = trace_record.block_type - x_i[1] = trace_record.caller - x_i[2] = trace_record.level - x_i[3] = trace_record.cf_id + x_i[1] = trace_record.level + x_i[2] = trace_record.cf_id p = np.zeros(len(self.policies)) for a in range(len(self.policies)): self.th_hat[a] = self.A_inv[a].dot(self.b[a]) @@ -679,8 +1098,429 @@ def _select_policy(self, trace_record, key): def cache_name(self): if self.enable_cache_row_key: - return "Hybrid LinUCB (linucb_hybrid)" - return "LinUCB (linucb)" + return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format( + self.cost_class_label + ) + return "LinUCB with cost class {} (linucb)".format(self.cost_class_label) + + +class OPTCacheEntry: + """ + A cache entry for the OPT algorithm. The entries are sorted based on its + next access sequence number in reverse order, i.e., the entry which next + access is the furthest in the future is ordered before other entries. 
+ """ + + def __init__(self, key, next_access_seq_no, value_size): + self.key = key + self.next_access_seq_no = next_access_seq_no + self.value_size = value_size + self.is_removed = False + + def __cmp__(self, other): + if other.next_access_seq_no != self.next_access_seq_no: + return other.next_access_seq_no - self.next_access_seq_no + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class PQTable: + """ + A hash table with a priority queue. + """ + + def __init__(self): + # A list of entries arranged in a heap sorted based on the entry custom + # implementation of __cmp__ + self.pq = [] + self.table = {} + + def pqinsert(self, entry): + "Add a new key or update the priority of an existing key" + # Remove the entry from the table first. + removed_entry = self.table.pop(entry.key, None) + if removed_entry: + # Mark as removed since there is no 'remove' API in heappq. + # Instead, an entry in pq is removed lazily when calling pop. + removed_entry.is_removed = True + self.table[entry.key] = entry + heapq.heappush(self.pq, entry) + return removed_entry + + def pqpop(self): + while self.pq: + entry = heapq.heappop(self.pq) + if not entry.is_removed: + del self.table[entry.key] + return entry + return None + + def pqpeek(self): + while self.pq: + entry = self.pq[0] + if not entry.is_removed: + return entry + heapq.heappop(self.pq) + return + + def __contains__(self, k): + return k in self.table + + def __getitem__(self, k): + return self.table[k] + + def __len__(self): + return len(self.table) + + def values(self): + return self.table.values() + + +class OPTCache(Cache): + """ + An implementation of the Belady MIN algorithm. OPTCache evicts an entry + in the cache whose next access occurs furthest in the future. + + Note that Belady MIN algorithm is optimal assuming all blocks having the + same size and a missing entry will be inserted in the cache. + These are NOT true for the block cache trace since blocks have different + sizes and we may not insert a block into the cache upon a cache miss. + However, it is still useful to serve as a "theoretical upper bound" on the + lowest miss ratio we can achieve given a cache size. + + L. A. Belady. 1966. A Study of Replacement Algorithms for a + Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101. + DOI=http://dx.doi.org/10.1147/sj.52.0078 + """ + + def __init__(self, cache_size): + super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0) + self.table = PQTable() + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its next access time. + assert ( + self.table.pqinsert( + OPTCacheEntry( + key, trace_record.next_access_seq_no, self.table[key].value_size + ) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert( + OPTCacheEntry(key, trace_record.next_access_seq_no, value_size) + ) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + return "Belady MIN (opt)" + + +class GDSizeEntry: + """ + A cache entry for the greedy dual size replacement policy. 
+ """ + + def __init__(self, key, value_size, priority): + self.key = key + self.value_size = value_size + self.priority = priority + self.is_removed = False + + def __cmp__(self, other): + if other.priority != self.priority: + return self.priority - other.priority + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class GDSizeCache(Cache): + """ + An implementation of the greedy dual size algorithm. + We define cost as an entry's size. + + See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html + and N. Young. The k-server dual and loose competitiveness for paging. + Algorithmica,June 1994, vol. 11,(no.6):525-41. + Rewritten version of ''On-line caching as cache size varies'', + in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key) + self.table = PQTable() + self.L = 0.0 + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid GreedyDualSize (gdsize_hybrid)" + return "GreedyDualSize (gdsize)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its priority. + entry = self.table[key] + assert ( + self.table.pqinsert( + GDSizeEntry(key, entry.value_size, self.L + entry.value_size) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.L = evict_entry.priority + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert(GDSizeEntry(key, value_size, self.L + value_size)) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class Deque(object): + """A Deque class facilitates the implementation of LRU and ARC.""" + + def __init__(self): + self.od = OrderedDict() + + def appendleft(self, k): + if k in self.od: + del self.od[k] + self.od[k] = None + + def pop(self): + item = self.od.popitem(last=False) if self.od else None + if item is not None: + return item[0] + return None + + def remove(self, k): + del self.od[k] + + def __len__(self): + return len(self.od) + + def __contains__(self, k): + return k in self.od + + def __iter__(self): + return reversed(self.od) + + def __repr__(self): + return "Deque(%r)" % (list(self),) + + +class ARCCache(Cache): + """ + An implementation of ARC. ARC assumes that all blocks are having the + same size. The size of index and filter blocks are variable. To accommodate + this, we modified ARC as follows: + 1) We use 16 KB as the average block size and calculate the number of blocks + (c) in the cache. + 2) When we insert an entry, the cache evicts entries in both t1 and t2 + queues until it has enough space for the new entry. This also requires + modification of the algorithm to maintain a maximum of 2*c blocks. + + Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low + Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on + File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA, + USA, 115-130. 
+ """ + + def __init__(self, cache_size, enable_cache_row_key): + super(ARCCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.c = cache_size / 16 * 1024 # Number of elements in the cache. + self.p = 0 # Target size for the list T1 + # L1: only once recently + self.t1 = Deque() # T1: recent cache entries + self.b1 = Deque() # B1: ghost entries recently evicted from the T1 cache + # L2: at least twice recently + self.t2 = Deque() # T2: frequent entries + self.b2 = Deque() # B2: ghost entries recently evicted from the T2 cache + + def _replace(self, key, value_size): + while self.used_size + value_size > self.cache_size: + if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)): + old = self.t1.pop() + self.b1.appendleft(old) + else: + if self.t2: + old = self.t2.pop() + self.b2.appendleft(old) + else: + old = self.t1.pop() + self.b1.appendleft(old) + self.used_size -= self.table[old].value_size + del self.table[old] + + def _lookup(self, trace_record, key, hash): + # Case I: key is in T1 or T2. + # Move key to MRU position in T2. + if key in self.t1: + self.t1.remove(key) + self.t2.appendleft(key) + return True + + if key in self.t2: + self.t2.remove(key) + self.t2.appendleft(key) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Case II: key is in B1 + # Move x from B1 to the MRU position in T2 (also fetch x to the cache). + if key in self.b1: + self.p = min(self.c, self.p + max(len(self.b2) / len(self.b1), 1)) + self._replace(key, value_size) + self.b1.remove(key) + self.t2.appendleft(key) + return + + # Case III: key is in B2 + # Move x from B2 to the MRU position in T2 (also fetch x to the cache). + if key in self.b2: + self.p = max(0, self.p - max(len(self.b1) / len(self.b2), 1)) + self._replace(key, value_size) + self.b2.remove(key) + self.t2.appendleft(key) + return + + # Case IV: key is not in (T1 u B1 u T2 u B2) + self._replace(key, value_size) + while len(self.t1) + len(self.b1) >= self.c and self.b1: + self.b1.pop() + + total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2) + while total >= (2 * self.c) and self.b2: + self.b2.pop() + total -= 1 + # Finally, move it to MRU position in T1. + self.t1.appendleft(key) + return + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid Adaptive Replacement Cache (arc_hybrid)" + return "Adaptive Replacement Cache (arc)" + + +class LRUCache(Cache): + """ + A strict LRU queue. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(LRUCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.lru = Deque() + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LRU (lru_hybrid)" + return "LRU (lru)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update LRU queue. 
+ self.lru.remove(key) + self.lru.appendleft(key) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_key = self.lru.pop() + self.used_size -= self.table[evict_key].value_size + del self.table[evict_key] + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + self.lru.appendleft(key) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class TraceCache(Cache): + """ + A trace cache. Lookup returns true if the trace observes a cache hit. + It is used to maintain cache hits observed in the trace. + """ + + def __init__(self, cache_size): + super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0) + + def _lookup(self, trace_record, key, hash): + return trace_record.is_hit + + def _evict(self, trace_record, key, hash, value_size): + pass + + def _insert(self, trace_record, key, hash, value_size): + pass + + def _should_admit(self, trace_record, key, hash, value_size): + return False + + def cache_name(self): + return "Trace" def parse_cache_size(cs): @@ -695,47 +1535,255 @@ def parse_cache_size(cs): def create_cache(cache_type, cache_size, downsample_size): - policies = [] - policies.append(LRUPolicy()) - policies.append(MRUPolicy()) - policies.append(LFUPolicy()) cache_size = cache_size / downsample_size - enable_cache_row_key = False + enable_cache_row_key = 0 + if "hybridn" in cache_type: + enable_cache_row_key = 2 + cache_type = cache_type[:-8] if "hybrid" in cache_type: - enable_cache_row_key = True + enable_cache_row_key = 1 cache_type = cache_type[:-7] if cache_type == "ts": - return ThompsonSamplingCache(cache_size, enable_cache_row_key, policies) + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + cost_class_label=None, + ) elif cache_type == "linucb": - return LinUCBCache(cache_size, enable_cache_row_key, policies) + return LinUCBCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pylru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LRUPolicy()], cost_class_label=None + ) + elif cache_type == "pymru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [MRUPolicy()], cost_class_label=None + ) + elif cache_type == "pylfu": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LFUPolicy()], cost_class_label=None + ) + elif cache_type == "pyhb": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pycctbbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_bt", + ) + elif cache_type == "pycccf": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="cf" + ) + elif cache_type == "pycctblevelbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_level_bt", + ) + elif cache_type == "pycccfbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="cf_bt", + ) + elif cache_type == "pycctb": + return ThompsonSamplingCache( + 
cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table", + ) + elif cache_type == "pyccbt": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="bt" + ) + elif cache_type == "opt": + if enable_cache_row_key: + print("opt does not support hybrid mode.") + assert False + return OPTCache(cache_size) + elif cache_type == "trace": + if enable_cache_row_key: + print("trace does not support hybrid mode.") + assert False + return TraceCache(cache_size) + elif cache_type == "lru": + return LRUCache(cache_size, enable_cache_row_key) + elif cache_type == "arc": + return ARCCache(cache_size, enable_cache_row_key) + elif cache_type == "gdsize": + return GDSizeCache(cache_size, enable_cache_row_key) else: print("Unknown cache type {}".format(cache_type)) assert False return None -def run(trace_file_path, cache_type, cache, warmup_seconds): +class BlockAccessTimeline: + """ + BlockAccessTimeline stores all accesses of a block. + """ + + def __init__(self): + self.accesses = [] + self.current_access_index = 1 + + def get_next_access(self): + if self.current_access_index == len(self.accesses): + return sys.maxsize + next_access_seq_no = self.accesses[self.current_access_index] + self.current_access_index += 1 + return next_access_seq_no + + +def percent(e1, e2): + if e2 == 0: + return -1 + return float(e1) * 100.0 / float(e2) + + +def is_target_cf(access_cf, target_cf_name): + if target_cf_name == "all": + return True + return access_cf == target_cf_name + + +def run( + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, +): warmup_complete = False - num = 0 + trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + access_seq_no = 0 + time_interval = 1 + start_time = time.time() trace_start_time = 0 trace_duration = 0 - start_time = time.time() + is_opt_cache = False + if cache.cache_name() == "Belady MIN (opt)": + is_opt_cache = True + + block_access_timelines = {} + num_no_inserts = 0 + num_blocks_with_no_size = 0 + num_inserts_block_with_no_size = 0 + + if is_opt_cache: + # Read all blocks in memory and stores their access times so that OPT + # can use this information to evict the cached key which next access is + # the furthest in the future. + print("Preprocessing block traces.") + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + ts = line.split(",") + timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + block_id = int(ts[1]) + block_size = int(ts[3]) + no_insert = int(ts[9]) + if block_id not in block_access_timelines: + block_access_timelines[block_id] = BlockAccessTimeline() + if block_size == 0: + num_blocks_with_no_size += 1 + block_access_timelines[block_id].accesses.append(access_seq_no) + access_seq_no += 1 + if no_insert == 1: + num_no_inserts += 1 + if no_insert == 0 and block_size == 0: + num_inserts_block_with_no_size += 1 + if access_seq_no % 100 != 0: + continue + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. 
Throughput: {} records/second.".format( + now - start_time, + access_seq_no, + trace_duration / 1000000, + access_seq_no / (now - start_time), + ) + ) + time_interval += 1 + print( + "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size." + "{3} accesses, {4}({5:.2f}%) accesses with no_insert," + "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format( + len(block_access_timelines), + num_blocks_with_no_size, + percent(num_blocks_with_no_size, len(block_access_timelines)), + access_seq_no, + num_no_inserts, + percent(num_no_inserts, access_seq_no), + num_inserts_block_with_no_size, + percent(num_inserts_block_with_no_size, access_seq_no), + ) + ) + + access_seq_no = 0 time_interval = 1 - trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + start_time = time.time() + trace_start_time = 0 + trace_duration = 0 + print("Running simulated {} cache on block traces.".format(cache.cache_name())) with open(trace_file_path, "r") as trace_file: for line in trace_file: - num += 1 - if num % 1000000 == 0: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + if access_seq_no % 1000000 == 0: # Force a python gc periodically to reduce memory usage. gc.collect() ts = line.split(",") timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue if trace_start_time == 0: trace_start_time = timestamp trace_duration = timestamp - trace_start_time - if not warmup_complete and trace_duration > warmup_seconds * 1000000: + if ( + not warmup_complete + and warmup_seconds > 0 + and trace_duration > warmup_seconds * 1000000 + ): cache.miss_ratio_stats.reset_counter() warmup_complete = True + next_access_seq_no = 0 + block_id = int(ts[1]) + if is_opt_cache: + next_access_seq_no = block_access_timelines[block_id].get_next_access() record = TraceRecord( access_time=int(ts[0]), block_id=int(ts[1]), @@ -751,13 +1799,23 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): key_id=int(ts[11]), kv_size=int(ts[12]), is_hit=int(ts[13]), + referenced_key_exist_in_block=int(ts[14]), + num_keys_in_block=int(ts[15]), + table_id=int(ts[16]), + seq_number=int(ts[17]), + block_key_size=int(ts[18]), + key_size=int(ts[19]), + block_offset_in_file=int(ts[20]), + next_access_seq_no=next_access_seq_no, ) trace_miss_ratio_stats.update_metrics( - record.access_time, is_hit=record.is_hit + record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size ) cache.access(record) + access_seq_no += 1 del record - if num % 100 != 0: + del ts + if access_seq_no % 100 != 0: continue # Report progress every 10 seconds. now = time.time() @@ -767,9 +1825,9 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): "duration of {} seconds. Throughput: {} records/second. " "Trace miss ratio {}".format( now - start_time, - num, + access_seq_no, trace_duration / 1000000, - num / (now - start_time), + access_seq_no / (now - start_time), trace_miss_ratio_stats.miss_ratio(), ) ) @@ -787,19 +1845,33 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): "Take {} seconds to process {} trace records with trace duration of {} " "seconds. Throughput: {} records/second. 
Trace miss ratio {}".format( now - start_time, - num, + access_seq_no, trace_duration / 1000000, - num / (now - start_time), + access_seq_no / (now - start_time), trace_miss_ratio_stats.miss_ratio(), ) ) + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) return trace_start_time, trace_duration def report_stats( - cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, ): - cache_label = "{}-{}".format(cache_type, cache_size) + cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name) with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: mrc_file.write( "{},0,0,{},{},{}\n".format( @@ -809,56 +1881,120 @@ def report_stats( cache.miss_ratio_stats.num_accesses, ) ) - cache.policy_stats.write_policy_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.policy_stats.write_policy_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.miss_ratio_stats.write_miss_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.miss_ratio_stats.write_miss_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_policy_stats.write_policy_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_policy_stats.write_policy_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_miss_ratio_stats.write_miss_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_miss_ratio_stats.write_miss_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) + + cache_stats = [ + cache.per_second_miss_ratio_stats, + cache.miss_ratio_stats, + cache.per_hour_miss_ratio_stats, + ] + for i in range(len(cache_stats)): + avg_miss_bytes, p95_miss_bytes = cache_stats[i].compute_miss_bytes() + + with open( + "{}/data-ml-avgmb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes) + ) + + with open( + "{}/data-ml-p95mb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes) + ) + + cache_stats[i].write_miss_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + cache_stats[i].write_miss_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + + if not cache.is_ml_cache(): + return + + policy_stats = [cache.policy_stats, cache.per_hour_policy_stats] + for i in range(len(policy_stats)): + policy_stats[i].write_policy_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + policy_stats[i].write_policy_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) if __name__ == "__main__": - if len(sys.argv) <= 6: + if len(sys.argv) <= 8: print( - "Must provide 6 arguments. " - "1) cache_type (ts, ts_hybrid, linucb, linucb_hybrid). 
" - "2) cache size (xM, xG, xT). " + "Must provide 8 arguments.\n" + "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, " + "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache " + "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. " + "Note that hybrid is not supported with opt and trace. \n" + "2) Cache size (xM, xG, xT).\n" "3) The sampling frequency used to collect the trace. (The " - "simulation scales down the cache size by the sampling frequency). " - "4) Warmup seconds (The number of seconds used for warmup). " - "5) Trace file path. " - "6) Result directory (A directory that saves generated results)" + "simulation scales down the cache size by the sampling frequency).\n" + "4) Warmup seconds (The number of seconds used for warmup).\n" + "5) Trace file path.\n" + "6) Result directory (A directory that saves generated results)\n" + "7) Max number of accesses to process\n" + "8) The target column family. (The simulation will only run " + "accesses on the target column family. If it is set to all, " + "it will run against all accesses.)" ) exit(1) + print("Arguments: {}".format(sys.argv)) cache_type = sys.argv[1] cache_size = parse_cache_size(sys.argv[2]) downsample_size = int(sys.argv[3]) warmup_seconds = int(sys.argv[4]) trace_file_path = sys.argv[5] result_dir = sys.argv[6] + max_accesses_to_process = int(sys.argv[7]) + target_cf_name = sys.argv[8] cache = create_cache(cache_type, cache_size, downsample_size) trace_start_time, trace_duration = run( - trace_file_path, cache_type, cache, warmup_seconds + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, ) trace_end_time = trace_start_time + trace_duration report_stats( - cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, ) diff --git a/tools/block_cache_analyzer/block_cache_pysim.sh b/tools/block_cache_analyzer/block_cache_pysim.sh index 58193a0635a..295f734aa05 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.sh +++ b/tools/block_cache_analyzer/block_cache_pysim.sh @@ -10,6 +10,10 @@ # warmup_seconds: The number of seconds used for warmup. # max_jobs: The max number of concurrent pysims to run. +# Install required packages to run simulations. +# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel +ulimit -c 0 + if [ $# -ne 5 ]; then echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" exit 0 @@ -20,17 +24,26 @@ result_dir="$2" downsample_size="$3" warmup_seconds="$4" max_jobs="$5" -current_jobs=0 +max_num_accesses=100000000 +current_jobs=1 ml_tmp_result_dir="$result_dir/ml" rm -rf "$ml_tmp_result_dir" mkdir -p "$result_dir" mkdir -p "$ml_tmp_result_dir" -for cache_type in "ts" "linucb" "ts_hybrid" "linucb_hybrid" +# Report miss ratio in the trace. +current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) +for cf_name in "all" +do +for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T" do -for cache_size in "16M" "256M" "1G" "2G" "4G" "8G" "12G" "16G" +for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid" #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace" do + if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then + # We only need to collect miss ratios observed in the trace once. 
+ continue + fi while [ "$current_jobs" -ge "$max_jobs" ] do sleep 10 @@ -38,12 +51,13 @@ do current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) echo "Waiting jobs to complete. Number of running jobs: $current_jobs" done - output="log-ml-$cache_type-$cache_size" - echo "Running simulation for $cache_type and cache size $cache_size. Number of running jobs: $current_jobs. " - nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" >& $ml_tmp_result_dir/$output & + output="log-ml-$cache_type-$cache_size-$cf_name" + echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs. " + nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" & current_jobs=$((current_jobs+1)) done done +done # Wait for all jobs to complete. while [ $current_jobs -gt 0 ] @@ -57,14 +71,14 @@ done echo "Combine individual pysim output files" rm -rf "$result_dir/ml_*" -mrc_file="$result_dir/ml_mrc" for header in "header-" "data-" do -for fn in $ml_tmp_result_dir/* +for fn in "$ml_tmp_result_dir"/* do sum_file="" time_unit="" capacity="" + target_cf_name="" if [[ $fn == *"timeline"* ]]; then tmpfn="$fn" IFS='-' read -ra elements <<< "$tmpfn" @@ -79,24 +93,43 @@ do done time_unit_index=$((time_unit_index+1)) capacity_index=$((time_unit_index+2)) + target_cf_name_index=$((time_unit_index+3)) time_unit="${elements[$time_unit_index]}_" capacity="${elements[$capacity_index]}_" + target_cf_name="${elements[$target_cf_name_index]}_" fi - if [[ $fn == "${header}ml-policy-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}policy_timeline" + if [[ $fn == *"${header}ml-policy-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline" + fi + if [[ $fn == *"${header}ml-policy-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline" fi - if [[ $fn == "${header}ml-policy-ratio-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}policy_ratio_timeline" + if [[ $fn == *"${header}ml-miss-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline" fi - if [[ $fn == "${header}ml-miss-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}miss_timeline" + if [[ $fn == *"${header}ml-miss-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline" + fi + if [[ $fn == *"${header}ml-mrc"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${target_cf_name}_mrc" fi - if [[ $fn == "${header}ml-miss-ratio-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}miss_ratio_timeline" + if [[ $fn == *"${header}ml-avgmb"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb" fi - if [[ $fn == "${header}ml-mrc"* ]]; then - sum_file="$mrc_file" + if [[ $fn == *"${header}ml-p95mb"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb" fi if [[ $sum_file == "" 
]]; then continue @@ -106,13 +139,18 @@ do continue fi fi - cat "$ml_tmp_result_dir/$fn" >> "$sum_file" + cat "$fn" >> "$sum_file" done done echo "Done" -# Sort MRC file by cache_type and cache_size. -tmp_file="$result_dir/tmp_mrc" -cat "$mrc_file" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" -cat "$tmp_file" > "$mrc_file" -rm -rf "$tmp_file" +for fn in $result_dir/* +do + if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then + # Sort MRC file by cache_type and cache_size. + tmp_file="$result_dir/tmp_mrc" + cat "$fn" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" + cat "$tmp_file" > "$fn" + rm -rf "$tmp_file" + fi +done diff --git a/tools/block_cache_analyzer/block_cache_pysim_test.py b/tools/block_cache_analyzer/block_cache_pysim_test.py index e298d7bbd6f..4b2bdeba656 100644 --- a/tools/block_cache_analyzer/block_cache_pysim_test.py +++ b/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -1,17 +1,30 @@ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os import random +import sys from block_cache_pysim import ( + ARCCache, + CacheEntry, + GDSizeCache, HashTable, + HyperbolicPolicy, LFUPolicy, LinUCBCache, + LRUCache, LRUPolicy, MRUPolicy, + OPTCache, + OPTCacheEntry, ThompsonSamplingCache, + TraceCache, TraceRecord, + create_cache, + kMicrosInSecond, kSampleSize, + run, ) @@ -33,30 +46,44 @@ def test_hash_table(): records = 100 for i in range(n): key_id = random.randint(0, records) + v = random.randint(0, records) key = "k{}".format(key_id) - value = "v{}".format(key_id) - action = random.randint(0, 2) - # print "{}:{}:{}".format(action, key, value) + value = CacheEntry(v, v, v, v, v, v, v) + action = random.randint(0, 10) assert len(truth_map) == table.elements, "{} {} {}".format( len(truth_map), table.elements, i ) - if action == 0: - table.insert(key, key_id, value) - truth_map[key] = value - elif action == 1: + if action <= 8: if key in truth_map: assert table.lookup(key, key_id) is not None - assert truth_map[key] == table.lookup(key, key_id) + assert truth_map[key].value_size == table.lookup(key, key_id).value_size else: assert table.lookup(key, key_id) is None + table.insert(key, key_id, value) + truth_map[key] = value else: - table.delete(key, key_id) + deleted = table.delete(key, key_id) + if deleted: + assert key in truth_map if key in truth_map: del truth_map[key] + + # Check all keys are unique in the sample set. 
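+    # random_sample must never return the same key twice; comparing each
+    # sample against its set of unique keys below enforces that.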
+    for _i in range(10):
+        samples = table.random_sample(kSampleSize)
+        unique_keys = {}
+        for sample in samples:
+            unique_keys[sample.key] = True
+        assert len(samples) == len(unique_keys)
+
+    assert len(table) == len(truth_map)
+    for key in truth_map:
+        assert table.lookup(key, int(key[1:])) is not None
+        assert truth_map[key].value_size == table.lookup(key, int(key[1:])).value_size
     print("Test hash table: Success")
 
 
-def assert_metrics(cache, expected_value):
+def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtable=True):
     assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format(
         expected_value[0], cache.used_size
     )
@@ -70,24 +97,35 @@ def assert_metrics(cache, expected_value):
     ), "Expected {}, Actual {}".format(
         expected_value[2], cache.miss_ratio_stats.num_misses
     )
-    assert cache.table.elements == len(expected_value[3]) + len(
+    assert len(cache.table) == len(expected_value[3]) + len(
         expected_value[4]
     ), "Expected {}, Actual {}".format(
         len(expected_value[3]) + len(expected_value[4]), cache.table.elements
     )
     for expected_k in expected_value[3]:
-        val = cache.table.lookup("b{}".format(expected_k), expected_k)
-        assert val is not None
-        assert val.value_size == 1
+        if custom_hashtable:
+            val = cache.table.lookup("b{}".format(expected_k), expected_k)
+        else:
+            val = cache.table["b{}".format(expected_k)]
+        assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format(
+            expected_k, expected_value, cache.table
+        )
+        assert val.value_size == expected_value_size
     for expected_k in expected_value[4]:
-        val = cache.table.lookup("g{}".format(expected_k), expected_k)
+        if custom_hashtable:
+            val = cache.table.lookup("g0-{}".format(expected_k), expected_k)
+        else:
+            val = cache.table["g0-{}".format(expected_k)]
         assert val is not None
-        assert val.value_size == 1
+        assert val.value_size == expected_value_size
 
 
 # Access k1, k1, k2, k3, k3, k3, k4
-def test_cache(policies, expected_value):
-    cache = ThompsonSamplingCache(3, False, policies)
+# When k4 is inserted,
+# LRU should evict k1.
+# LFU should evict k2.
+# MRU should evict k3.
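+# (After the access sequence below, k1 is the least recently used entry,
+# k2 the least frequently used, and k3 the most recently used.)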
+def test_cache(cache, expected_value, custom_hashtable=True): k1 = TraceRecord( access_time=0, block_id=1, @@ -103,6 +141,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k2 = TraceRecord( access_time=1, @@ -119,6 +165,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k3 = TraceRecord( access_time=2, @@ -135,6 +189,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k4 = TraceRecord( access_time=3, @@ -151,6 +213,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) sequence = [k1, k1, k2, k3, k3, k3] index = 0 @@ -167,20 +237,29 @@ def test_cache(policies, expected_value): expected_values.append([3, 5, 3, [1, 2, 3], []]) # Access k3, hit. expected_values.append([3, 6, 3, [1, 2, 3], []]) + access_time = 0 for access in sequence: + access.access_time = access_time cache.access(access) - assert_metrics(cache, expected_values[index]) + assert_metrics( + cache, + expected_values[index], + expected_value_size=1, + custom_hashtable=custom_hashtable, + ) + access_time += 1 index += 1 + k4.access_time = access_time cache.access(k4) - assert_metrics(cache, expected_value) + assert_metrics( + cache, expected_value, expected_value_size=1, custom_hashtable=custom_hashtable + ) -def test_lru_cache(): +def test_lru_cache(cache, custom_hashtable): print("Test LRU cache") - policies = [] - policies.append(LRUPolicy()) # Access k4, miss. evict k1 - test_cache(policies, [3, 7, 4, [2, 3, 4], []]) + test_cache(cache, [3, 7, 4, [2, 3, 4], []], custom_hashtable) print("Test LRU cache: Success") @@ -189,7 +268,10 @@ def test_mru_cache(): policies = [] policies.append(MRUPolicy()) # Access k4, miss. evict k3 - test_cache(policies, [3, 7, 4, [1, 2, 4], []]) + test_cache( + ThompsonSamplingCache(3, False, policies, cost_class_label=None), + [3, 7, 4, [1, 2, 4], []], + ) print("Test MRU cache: Success") @@ -198,22 +280,36 @@ def test_lfu_cache(): policies = [] policies.append(LFUPolicy()) # Access k4, miss. 
evict k2
-    test_cache(policies, [3, 7, 4, [1, 3, 4], []])
+    test_cache(
+        ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+        [3, 7, 4, [1, 3, 4], []],
+    )
     print("Test LFU cache: Success")
 
 
 def test_mix(cache):
     print("Test Mix {} cache".format(cache.cache_name()))
     n = 100000
-    records = 199
+    records = 100
+    block_size_table = {}
+    trace_num_misses = 0
     for i in range(n):
         key_id = random.randint(0, records)
         vs = random.randint(0, 10)
+        now = i * kMicrosInSecond
+        block_size = vs
+        if key_id in block_size_table:
+            block_size = block_size_table[key_id]
+        else:
+            block_size_table[key_id] = block_size
+        is_hit = key_id % 2
+        if is_hit == 0:
+            trace_num_misses += 1
         k = TraceRecord(
-            access_time=i,
+            access_time=now,
             block_id=key_id,
             block_type=1,
-            block_size=vs,
+            block_size=block_size,
             cf_id=0,
             cf_name="",
             level=0,
@@ -223,13 +319,117 @@ def test_mix(cache):
             get_id=key_id,
             key_id=key_id,
             kv_size=5,
-            is_hit=1,
+            is_hit=is_hit,
+            referenced_key_exist_in_block=1,
+            num_keys_in_block=0,
+            table_id=0,
+            seq_number=0,
+            block_key_size=0,
+            key_size=0,
+            block_offset_in_file=0,
+            next_access_seq_no=vs,
         )
         cache.access(k)
     assert cache.miss_ratio_stats.miss_ratio() > 0
+    if cache.cache_name() == "Trace":
+        assert cache.miss_ratio_stats.num_accesses == n
+        assert cache.miss_ratio_stats.num_misses == trace_num_misses
+    else:
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expected {} Actual {}".format(
+            cache.used_size, cached_size
+        )
     print("Test Mix {} cache: Success".format(cache.cache_name()))
 
 
+def test_end_to_end():
+    print("Test All caches")
+    n = 100000
+    nblocks = 1000
+    block_size = 16 * 1024
+    ncfs = 7
+    nlevels = 6
+    nfds = 100000
+    trace_file_path = "test_trace"
+    # All blocks are of the same size so that OPT must achieve the lowest miss
+    # ratio.
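+    # (With uniform block sizes, Belady's MIN algorithm is provably optimal,
+    # so every other policy's miss ratio is compared against OPT's below.)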
+    with open(trace_file_path, "w+") as trace_file:
+        access_records = ""
+        for i in range(n):
+            key_id = random.randint(0, nblocks)
+            cf_id = random.randint(0, ncfs)
+            level = random.randint(0, nlevels)
+            fd = random.randint(0, nfds)
+            now = i * kMicrosInSecond
+            access_record = ""
+            access_record += "{},".format(now)
+            access_record += "{},".format(key_id)
+            access_record += "{},".format(9)  # block type
+            access_record += "{},".format(block_size)  # block size
+            access_record += "{},".format(cf_id)
+            access_record += "cf_{},".format(cf_id)
+            access_record += "{},".format(level)
+            access_record += "{},".format(fd)
+            access_record += "{},".format(key_id % 3)  # caller
+            access_record += "{},".format(0)  # no insert
+            access_record += "{},".format(i)  # get_id
+            access_record += "{},".format(i)  # key_id
+            access_record += "{},".format(100)  # kv_size
+            access_record += "{},".format(1)  # is_hit
+            access_record += "{},".format(1)  # referenced_key_exist_in_block
+            access_record += "{},".format(10)  # num_keys_in_block
+            access_record += "{},".format(1)  # table_id
+            access_record += "{},".format(0)  # seq_number
+            access_record += "{},".format(10)  # block key size
+            access_record += "{},".format(20)  # key size
+            access_record += "{},".format(0)  # block offset
+            access_record = access_record[:-1]
+            access_records += access_record + "\n"
+        trace_file.write(access_records)
+
+    print("Test All caches: Start testing caches")
+    cache_size = block_size * nblocks / 10
+    downsample_size = 1
+    cache_ms = {}
+    for cache_type in [
+        "ts",
+        "opt",
+        "lru",
+        "pylru",
+        "linucb",
+        "gdsize",
+        "pyccbt",
+        "pycctbbt",
+    ]:
+        cache = create_cache(cache_type, cache_size, downsample_size)
+        run(trace_file_path, cache_type, cache, 0, -1, "all")
+        cache_ms[cache_type] = cache
+        assert cache.miss_ratio_stats.num_accesses == n
+
+    for cache_type in cache_ms:
+        cache = cache_ms[cache_type]
+        ms = cache.miss_ratio_stats.miss_ratio()
+        assert ms <= 100.0 and ms >= 0.0
+        # OPT should perform the best.
+        assert cache_ms["opt"].miss_ratio_stats.miss_ratio() <= ms
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expected {} Actual {}".format(
+            cache.used_size, cached_size
+        )
+        print("Test All {}: Success".format(cache.cache_name()))
+
+    os.remove(trace_file_path)
+    print("Test All: Success")
+
+
 def test_hybrid(cache):
     print("Test {} cache".format(cache.cache_name()))
     k = TraceRecord(
@@ -247,6 +447,14 @@ def test_hybrid(cache):
         key_id=1,
         kv_size=0,  # no size.
         is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
     )
     cache.access(k)  # Expect a miss.
     # used size, num accesses, num misses, hash table size, blocks, get keys.
@@ -319,22 +527,208 @@ def test_hybrid(cache):
     k.key_id = 4  # Same row key and should not be inserted again.
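    # (Row keys combine the sst fd number and the user key, neither of which
    # changes here.)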
k.kv_size = 1 cache.access(k) - assert_metrics(cache, [16, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]) + assert_metrics( + cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []] + ) print("Test {} cache: Success".format(cache.cache_name())) +def test_opt_cache(): + print("Test OPT cache") + cache = OPTCache(3) + # seq: 0, 1, 2, 3, 4, 5, 6, 7, 8 + # key: k1, k2, k3, k4, k5, k6, k7, k1, k8 + # next_access: 7, 19, 18, M, M, 17, 16, 25, M + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert_metrics( + cache, [1, 1, 1, [1], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 2 + k.next_access_seq_no = 19 + cache.access(k) + assert_metrics( + cache, [2, 2, 2, [1, 2], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 3 + k.next_access_seq_no = 18 + cache.access(k) + assert_metrics( + cache, [3, 3, 3, [1, 2, 3], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 4 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 2 since its next access 19 is the furthest in the future. + assert_metrics( + cache, [3, 4, 4, [1, 3, 4], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 5 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 4 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 5, 5, [1, 3, 5], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 6 + k.next_access_seq_no = 17 + cache.access(k) + # Evict 5 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 6, 6, [1, 3, 6], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 7 + k.next_access_seq_no = 16 + cache.access(k) + # Evict 3 since its next access 18 is the furthest in the future. + assert_metrics( + cache, [3, 7, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 1 + k.next_access_seq_no = 25 + cache.access(k) + assert_metrics( + cache, [3, 8, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 8 + k.next_access_seq_no = sys.maxsize + cache.access(k) + # Evict 1 since its next access 25 is the furthest in the future. + assert_metrics( + cache, [3, 9, 8, [6, 7, 8], []], expected_value_size=1, custom_hashtable=False + ) + + # Insert a large kv pair to evict all keys. 
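+    # (block_size is set to 3 below, the full cache capacity, so admitting
+    # this block forces out every resident key.)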
+ k.access_time += 1 + k.block_id = 10 + k.block_size = 3 + k.next_access_seq_no = sys.maxsize + cache.access(k) + assert_metrics( + cache, [3, 10, 9, [10], []], expected_value_size=3, custom_hashtable=False + ) + print("Test OPT cache: Success") + + +def test_trace_cache(): + print("Test trace cache") + cache = TraceCache(0) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=0, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 1 + assert cache.miss_ratio_stats.num_misses == 0 + k.is_hit = 0 + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 2 + assert cache.miss_ratio_stats.num_misses == 1 + print("Test trace cache: Success") + + if __name__ == "__main__": - policies = [] - policies.append(MRUPolicy()) - policies.append(LRUPolicy()) - policies.append(LFUPolicy()) test_hash_table() - test_lru_cache() + test_trace_cache() + test_opt_cache() + test_lru_cache( + ThompsonSamplingCache( + 3, enable_cache_row_key=0, policies=[LRUPolicy()], cost_class_label=None + ), + custom_hashtable=True, + ) + test_lru_cache(LRUCache(3, enable_cache_row_key=0), custom_hashtable=False) test_mru_cache() test_lfu_cache() - test_mix(ThompsonSamplingCache(100, False, policies)) - test_mix(ThompsonSamplingCache(100, True, policies)) - test_mix(LinUCBCache(100, False, policies)) - test_mix(LinUCBCache(100, True, policies)) - test_hybrid(ThompsonSamplingCache(kSampleSize, True, [LRUPolicy()])) - test_hybrid(LinUCBCache(kSampleSize, True, [LRUPolicy()])) + test_hybrid( + ThompsonSamplingCache( + kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + test_hybrid( + LinUCBCache( + kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + for cache_type in [ + "ts", + "opt", + "arc", + "pylfu", + "pymru", + "trace", + "pyhb", + "lru", + "pylru", + "linucb", + "gdsize", + "pycctbbt", + "pycctb", + "pyccbt", + ]: + for enable_row_cache in [0, 1, 2]: + cache_type_str = cache_type + if cache_type != "opt" and cache_type != "trace": + if enable_row_cache == 1: + cache_type_str += "_hybrid" + elif enable_row_cache == 2: + cache_type_str += "_hybridn" + test_mix(create_cache(cache_type_str, cache_size=100, downsample_size=1)) + test_end_to_end() diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index 032ed2be24f..e1021b466c3 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -127,6 +127,9 @@ DEFINE_string(analyze_get_spatial_locality_labels, "", "Group data blocks using these labels."); DEFINE_string(analyze_get_spatial_locality_buckets, "", "Group data blocks by their statistics using these buckets."); +DEFINE_string(skew_labels, "", + "Group the access count of a block using these labels."); +DEFINE_string(skew_buckets, "", "Group the skew labels using these buckets."); DEFINE_bool(mrc_only, false, "Evaluate alternative cache policies only. 
When this flag is true, "
            "the analyzer does NOT maintain states of each block in memory for "
@@ -147,6 +150,7 @@ namespace {
 const std::string kMissRatioCurveFileName = "mrc";
 const std::string kGroupbyBlock = "block";
+const std::string kGroupbyTable = "table";
 const std::string kGroupbyColumnFamily = "cf";
 const std::string kGroupbySSTFile = "sst";
 const std::string kGroupbyBlockType = "bt";
@@ -164,6 +168,7 @@ const std::string kSupportedCacheNames =
 // The suffix for the generated csv files.
 const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline";
 const std::string kFileNameSuffixMissTimeline = "miss_timeline";
+const std::string kFileNameSuffixSkew = "skewness";
 const std::string kFileNameSuffixAccessTimeline = "access_timeline";
 const std::string kFileNameSuffixCorrelation = "correlation_input";
 const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
@@ -540,6 +545,62 @@ void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const {
   }
 }
 
+void BlockCacheTraceAnalyzer::WriteSkewness(
+    const std::string& label_str, const std::vector<uint64_t>& percent_buckets,
+    TraceType target_block_type) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, uint64_t> label_naccesses;
+  uint64_t total_naccesses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    if (target_block_type != TraceType::kTraceMax &&
+        target_block_type != type) {
+      return;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    label_naccesses[label] += block.num_accesses;
+    total_naccesses += block.num_accesses;
+  };
+  TraverseBlocks(block_callback, &labels);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_bucket_naccesses;
+  std::vector<std::pair<std::string, uint64_t>> pairs;
+  for (auto const& itr : label_naccesses) {
+    pairs.push_back(itr);
+  }
+  // Sort in descending order.
+  sort(pairs.begin(), pairs.end(),
+       [=](std::pair<std::string, uint64_t>& a,
           std::pair<std::string, uint64_t>& b) { return b.second < a.second; });
+
+  size_t prev_start_index = 0;
+  for (auto const& percent : percent_buckets) {
+    label_bucket_naccesses[label_str][percent] = 0;
+    size_t end_index = 0;
+    if (percent == port::kMaxUint64) {
+      end_index = label_naccesses.size();
+    } else {
+      end_index = percent * label_naccesses.size() / 100;
+    }
+    for (size_t i = prev_start_index; i < end_index; i++) {
+      label_bucket_naccesses[label_str][percent] += pairs[i].second;
+    }
+    prev_start_index = end_index;
+  }
+  std::string filename_suffix;
+  if (target_block_type != TraceType::kTraceMax) {
+    filename_suffix = block_type_to_string(target_block_type);
+    filename_suffix += "_";
+  }
+  filename_suffix += kFileNameSuffixSkew;
+  WriteStatsToFile(label_str, percent_buckets, filename_suffix,
+                   label_bucket_naccesses, total_naccesses);
+}
+
 void BlockCacheTraceAnalyzer::WriteCorrelationFeatures(
     const std::string& label_str, uint32_t max_number_of_values) const {
   std::set<std::string> labels = ParseLabelStr(label_str);
@@ -549,12 +610,16 @@
       [&](const std::string& cf_name, uint64_t fd, uint32_t level,
           TraceType block_type, const std::string& /*block_key*/,
           uint64_t /*block_key_id*/, const BlockAccessInfo& block) {
+        if (block.table_id == 0 && labels.find(kGroupbyTable) != labels.end()) {
+          // We only know table id information for get requests.
+          return;
+        }
         if (labels.find(kGroupbyCaller) != labels.end()) {
          // Group by caller.
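          // (Each (label, caller) pair gets its own feature vector so that
          // access patterns from different callers stay separable.)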
          for (auto const& caller_map : block.caller_access_timeline) {
            const std::string label =
                BuildLabel(labels, cf_name, fd, level, block_type,
-                          caller_map.first, /*block_id=*/0);
+                          caller_map.first, /*block_id=*/0, block);
            auto it = block.caller_access_sequence__number_timeline.find(
                caller_map.first);
            assert(it != block.caller_access_sequence__number_timeline.end());
@@ -563,14 +628,15 @@
          }
          return;
        }
-        const std::string label = BuildLabel(
-            labels, cf_name, fd, level, block_type,
-            TableReaderCaller::kMaxBlockCacheLookupCaller, /*block_id=*/0);
+        const std::string label =
+            BuildLabel(labels, cf_name, fd, level, block_type,
+                       TableReaderCaller::kMaxBlockCacheLookupCaller,
+                       /*block_id=*/0, block);
         UpdateFeatureVectors(block.access_sequence_number_timeline,
                              block.access_timeline, label, &label_features,
                              &label_predictions);
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions,
                                  max_number_of_values);
 }
@@ -656,7 +722,7 @@ std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr(
 std::string BlockCacheTraceAnalyzer::BuildLabel(
     const std::set<std::string>& labels, const std::string& cf_name,
     uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller,
-    uint64_t block_key) const {
+    uint64_t block_key, const BlockAccessInfo& block) const {
   std::map<std::string, std::string> label_value_map;
   label_value_map[kGroupbyAll] = kGroupbyAll;
   label_value_map[kGroupbyLevel] = std::to_string(level);
@@ -665,6 +731,7 @@ std::string BlockCacheTraceAnalyzer::BuildLabel(
   label_value_map[kGroupbyBlockType] = block_type_to_string(type);
   label_value_map[kGroupbyColumnFamily] = cf_name;
   label_value_map[kGroupbyBlock] = std::to_string(block_key);
+  label_value_map[kGroupbyTable] = std::to_string(block.table_id);
   // Concatenate the label values.
   std::string label;
   for (auto const& l : labels) {
@@ -683,7 +750,8 @@ void BlockCacheTraceAnalyzer::TraverseBlocks(
     std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
                        uint32_t /*level*/, TraceType /*block_type*/,
                        const std::string& /*block_key*/,
                        uint64_t /*block_key_id*/,
                        const BlockAccessInfo& /*block_access_info*/)>
-        block_callback) const {
+        block_callback,
+    std::set<std::string>* labels) const {
   for (auto const& cf_aggregates : cf_aggregates_map_) {
     // Stats per column family.
     const std::string& cf_name = cf_aggregates.first;
@@ -698,6 +766,11 @@ void BlockCacheTraceAnalyzer::TraverseBlocks(
         for (auto const& block_access_info :
              block_type_aggregates.second.block_access_info_map) {
           // Stats per block.
+          if (labels && block_access_info.second.table_id == 0 &&
+              labels->find(kGroupbyTable) != labels->end()) {
+            // We only know table id information for get requests.
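+            // (Blocks never referenced by a Get keep the default table_id of
+            // 0, so they cannot be grouped by table.)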
+            continue;
+          }
           block_callback(cf_name, fd, level, type, block_access_info.first,
                          block_access_info.second.block_id,
                          block_access_info.second);
@@ -733,7 +806,7 @@ void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
         }
         const std::string label =
             BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock,
-                       TableReaderCaller::kUserGet, /*block_id=*/0);
+                       TableReaderCaller::kUserGet, /*block_id=*/0, block);
         const uint64_t percent_referenced_for_existing_keys =
             static_cast<uint64_t>(std::max(
@@ -761,7 +834,7 @@ void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
             ->second += 1;
         nblocks += 1;
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys,
                    label_pnrefkeys_nblocks, nblocks);
   WriteStatsToFile(label_str, percent_buckets,
@@ -792,7 +865,7 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
           continue;
         }
         const std::string label =
-            BuildLabel(labels, cf_name, fd, level, type, caller, block_id);
+            BuildLabel(labels, cf_name, fd, level, type, caller, block_id, block);
         for (auto const& naccess : timeline.second) {
           const uint64_t timestamp = naccess.first / time_unit;
           const uint64_t num = naccess.second;
@@ -806,7 +879,7 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
           access_count_block_id_map[naccesses].push_back(std::to_string(block_id));
         }
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // We have label_access_timeline now. Write them into a file.
   const std::string user_access_prefix =
@@ -877,9 +950,9 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance(
                             uint32_t level, TraceType type,
                             const std::string& /*block_key*/, uint64_t block_id,
                             const BlockAccessInfo& block) {
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     if (label_distance_num_reuses.find(label) ==
         label_distance_num_reuses.end()) {
       // The first time we encounter this label.
@@ -894,7 +967,7 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance(
       total_num_reuses += reuse_distance.second;
     }
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // We have label_naccesses and label_distance_num_reuses now. Write them into
   // a file.
   const std::string output_path =
@@ -1016,17 +1089,17 @@ void BlockCacheTraceAnalyzer::WriteReuseInterval(
     if (labels.find(kGroupbyCaller) != labels.end()) {
       for (auto const& timeline : block.caller_num_accesses_timeline) {
         const TableReaderCaller caller = timeline.first;
-        const std::string label =
-            BuildLabel(labels, cf_name, fd, level, type, caller, block_id);
+        const std::string label = BuildLabel(labels, cf_name, fd, level, type,
+                                             caller, block_id, block);
         UpdateReuseIntervalStats(label, time_buckets, timeline.second,
                                  &label_time_num_reuses, &total_num_reuses);
       }
       return;
     }
     // Does not group by caller so we need to flatten the access timeline.
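    // (The per-caller access counts are summed into a single timeline keyed
    // by timestamp.)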
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     std::map<uint64_t, uint64_t> timeline;
     for (auto const& caller_timeline : block.caller_num_accesses_timeline) {
       for (auto const& time_naccess : caller_timeline.second) {
@@ -1045,7 +1118,7 @@
     label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second +=
         block.num_accesses;
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // Write the stats into files.
   WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval,
@@ -1074,9 +1147,9 @@ void BlockCacheTraceAnalyzer::WriteReuseLifetime(
     } else {
       lifetime = port::kMaxUint64 - 1;
     }
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) {
       // The first time we encounter this label.
@@ -1087,7 +1160,7 @@
     label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1;
     total_nblocks += 1;
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime,
                    label_lifetime_nblocks, total_nblocks);
 }
@@ -1396,11 +1469,17 @@ Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord(
   int ret = snprintf(
       trace_record_buffer_, sizeof(trace_record_buffer_),
       "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32
-      ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n",
+      ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u,%u,%" PRIu64
+      ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
      access.access_timestamp, block_id, access.block_type, access.block_size,
      access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number,
      access.caller, access.no_insert, access.get_id, get_key_id,
-      access.referenced_data_size, access.is_cache_hit);
+      access.referenced_data_size, access.is_cache_hit,
+      access.referenced_key_exist_in_block, access.num_keys_in_block,
+      BlockCacheTraceHelper::GetTableId(access),
+      BlockCacheTraceHelper::GetSequenceNumber(access), access.block_key.size(),
+      access.referenced_key.size(),
+      BlockCacheTraceHelper::GetBlockOffsetInFile(access));
   if (ret < 0) {
     return Status::IOError("failed to format the output");
   }
@@ -1432,13 +1511,13 @@ Status BlockCacheTraceAnalyzer::RecordAccess(
   uint64_t get_key_id = 0;
   if (access.caller == TableReaderCaller::kUserGet &&
      access.get_id != BlockCacheTraceHelper::kReservedGetId) {
-    std::string row_key = BlockCacheTraceHelper::ComputeRowKey(access);
-    if (get_key_info_map_.find(row_key) == get_key_info_map_.end()) {
-      get_key_info_map_[row_key].key_id = unique_get_key_id_;
-      get_key_id = unique_get_key_id_;
+    std::string user_key = ExtractUserKey(access.referenced_key).ToString();
+    if (get_key_info_map_.find(user_key) == get_key_info_map_.end()) {
+      get_key_info_map_[user_key].key_id = unique_get_key_id_;
+      unique_get_key_id_++;
     }
-    get_key_info_map_[row_key].AddAccess(access, access_sequence_number_);
+    get_key_id = get_key_info_map_[user_key].key_id;
+    get_key_info_map_[user_key].AddAccess(access, access_sequence_number_);
   }
   if (compute_reuse_distance_) {
@@ -2224,6 +2303,25 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) {
     analyzer.WriteCorrelationFeaturesForGet(
         FLAGS_analyze_correlation_coefficients_max_number_of_values);
   }
+
+  if (!FLAGS_skew_labels.empty() && !FLAGS_skew_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_skew_buckets);
+    std::stringstream ss(FLAGS_skew_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceIndexBlock);
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceFilterBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kBlockTraceDataBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      } else {
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      }
+    }
+  }
   return 0;
 }
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h
index bc41ff468cc..f22a9da68f3 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h
@@ -33,6 +33,8 @@ struct GetKeyInfo {
 // Statistics of a block.
 struct BlockAccessInfo {
   uint64_t block_id = 0;
+  uint64_t table_id = 0;
+  uint64_t block_offset = 0;
   uint64_t num_accesses = 0;
   uint64_t block_size = 0;
   uint64_t first_access_time = 0;
@@ -73,6 +75,8 @@ struct BlockAccessInfo {
     if (first_access_time == 0) {
       first_access_time = access.access_timestamp;
     }
+    table_id = BlockCacheTraceHelper::GetTableId(access);
+    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
     last_access_time = access.access_timestamp;
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
@@ -301,6 +305,10 @@ class BlockCacheTraceAnalyzer {
   void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const;
 
+  void WriteSkewness(const std::string& label_str,
+                     const std::vector<uint64_t>& percent_buckets,
+                     TraceType target_block_type) const;
+
   const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
   TEST_cf_aggregates_map() const {
     return cf_aggregates_map_;
@@ -312,7 +320,8 @@ class BlockCacheTraceAnalyzer {
   std::string BuildLabel(const std::set<std::string>& labels,
                          const std::string& cf_name, uint64_t fd,
                          uint32_t level, TraceType type,
-                         TableReaderCaller caller, uint64_t block_key) const;
+                         TableReaderCaller caller, uint64_t block_key,
+                         const BlockAccessInfo& block) const;
 
   void ComputeReuseDistance(BlockAccessInfo* info) const;
 
@@ -341,7 +350,8 @@ class BlockCacheTraceAnalyzer {
      std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
                         uint32_t /*level*/, TraceType /*block_type*/,
                         const std::string& /*block_key*/,
                         uint64_t /*block_key_id*/,
                         const BlockAccessInfo& /*block_access_info*/)>
-          block_callback) const;
+          block_callback,
+      std::set<std::string>* labels = nullptr) const;
 
   void UpdateFeatureVectors(
       const std::vector<uint64_t>& access_sequence_number_timeline,
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
index 9917d5b9e78..eecd6e80d9d 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -181,7 +181,9 @@ class BlockCacheTracerTest : public testing::Test {
         analyze_get_spatial_locality_labels_,
         "-analyze_get_spatial_locality_buckets=" +
             analyze_get_spatial_locality_buckets_,
-        "-analyze_correlation_coefficients_labels=all"};
+        "-analyze_correlation_coefficients_labels=all",
+        "-skew_labels=all",
"-skew_buckets=10,50,100"}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -331,6 +333,33 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } } } + { + // Validate the skewness csv file. + const std::string skewness_file_path = test_path_ + "/all_skewness"; + std::ifstream skew_file(skewness_file_path); + // Read header. + std::string line; + ASSERT_TRUE(getline(skew_file, line)); + std::stringstream ss(line); + double sum_percent = 0; + while (getline(skew_file, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + sum_percent += ParseDouble(substr); + } + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_FALSE(getline(skew_file, line)); + skew_file.close(); + ASSERT_OK(env_->DeleteFile(skewness_file_path)); + } { // Validate the timeline csv files. const std::vector time_units{"_60", "_3600"}; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 1eeb64ac85d..4f39be609fe 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -61,11 +61,40 @@ std::string BlockCacheTraceHelper::ComputeRowKey( return ""; } Slice key = ExtractUserKey(access.referenced_key); - uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse - ? 0 - : 1 + GetInternalKeySeqno(access.referenced_key); - return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + - std::to_string(seq_no); + return std::to_string(access.sst_fd_number) + "_" + key.ToString(); +} + +uint64_t BlockCacheTraceHelper::GetTableId( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller) || access.referenced_key.size() < 4) { + return 0; + } + return static_cast(DecodeFixed32(access.referenced_key.data())) + 1; +} + +uint64_t BlockCacheTraceHelper::GetSequenceNumber( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return 0; + } + return access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); +} + +uint64_t BlockCacheTraceHelper::GetBlockOffsetInFile( + const BlockCacheTraceRecord& access) { + Slice input(access.block_key); + uint64_t offset = 0; + while (true) { + uint64_t tmp = 0; + if (GetVarint64(&input, &tmp)) { + offset = tmp; + } else { + break; + } + } + return offset; } BlockCacheTraceWriter::BlockCacheTraceWriter( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 3863ca430a4..b109b1db01c 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -31,6 +31,15 @@ class BlockCacheTraceHelper { // Row key is a concatenation of the access's fd_number and the referenced // user key. static std::string ComputeRowKey(const BlockCacheTraceRecord& access); + // The first four bytes of the referenced key in a Get request is the table + // id. + static uint64_t GetTableId(const BlockCacheTraceRecord& access); + // The sequence number of a get request is the last part of the referenced + // key. + static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access); + // Block offset in a file is the last varint64 in the block key. 
+ static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access); + static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; }; diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index babdd431f5a..3d3432e20a4 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -84,7 +84,7 @@ class CacheSimulatorTest : public testing::Test { for (auto const& key : keys) { std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber; auto handle = - sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString() + "_0"); + sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString()); EXPECT_NE(nullptr, handle); sim_cache->Release(handle); } @@ -229,10 +229,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses()); ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); - auto handle = sim_cache->Lookup( - std::to_string(first_get.sst_fd_number) + "_" + - ExtractUserKey(first_get.referenced_key).ToString() + "_" + - std::to_string(1 + GetInternalKeySeqno(first_get.referenced_key))); + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -256,10 +255,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses()); ASSERT_EQ(66, static_cast( cache_simulator->miss_ratio_stats().user_miss_ratio())); - handle = sim_cache->Lookup( - std::to_string(second_get.sst_fd_number) + "_" + - ExtractUserKey(second_get.referenced_key).ToString() + "_" + - std::to_string(1 + GetInternalKeySeqno(second_get.referenced_key))); + handle = + sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" + + ExtractUserKey(second_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -394,7 +392,7 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4, {"1", "2", "3", "5"}, {"1", "2", "4"}); for (auto const& key : {"1", "2", "4"}) { - auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); } @@ -417,7 +415,7 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {}, {}); for (auto const& key : {"1", "2", "4"}) { - auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); ASSERT_EQ(nullptr, handle); } } @@ -437,9 +435,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - auto handle = sim_cache->Lookup( - std::to_string(first_get.sst_fd_number) + "_" + - ExtractUserKey(first_get.referenced_key).ToString() + "_0"); + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); // All blocks are missing from the 
cache since insert_blocks_row_kvpair_misses From 38b03c840e0ac49cffc7f5a667c6bc910648b9a1 Mon Sep 17 00:00:00 2001 From: Aaryaman Sagar Date: Wed, 7 Aug 2019 14:29:35 -0700 Subject: [PATCH 285/572] Port folly/synchronization/DistributedMutex to rocksdb (#5642) Summary: This ports `folly::DistributedMutex` into RocksDB. The PR includes everything else needed to compile and use DistributedMutex as a component within folly. Most files are unchanged except for some portability stuff and includes. For now, I've put this under `rocksdb/third-party`, but if there is a better folder to put this under, let me know. I also am not sure how or where to put unit tests for third-party stuff like this. It seems like gtest is included already, but I need to link with it from another third-party folder. This also includes some other common components from folly - folly/Optional - folly/ScopeGuard (In particular `SCOPE_EXIT`) - folly/synchronization/ParkingLot (A portable futex-like interface) - folly/synchronization/AtomicNotification (The standard C++ interface for futexes) - folly/Indestructible (For singletons that don't get destroyed without allocations) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5642 Differential Revision: D16544439 fbshipit-source-id: 179b98b5dcddc3075926d31a30f92fd064245731 --- CMakeLists.txt | 23 + Makefile | 30 +- build_tools/build_detect_platform | 6 + build_tools/fbcode_config.sh | 2 + build_tools/fbcode_config_platform007.sh | 2 + src.mk | 7 + third-party/folly/folly/CPortability.h | 15 + third-party/folly/folly/ConstexprMath.h | 17 + third-party/folly/folly/Indestructible.h | 166 ++ third-party/folly/folly/Optional.h | 570 ++++++ third-party/folly/folly/Portability.h | 74 + third-party/folly/folly/ScopeGuard.h | 54 + third-party/folly/folly/Traits.h | 152 ++ third-party/folly/folly/Unit.h | 59 + third-party/folly/folly/Utility.h | 141 ++ third-party/folly/folly/chrono/Hardware.h | 33 + third-party/folly/folly/container/Array.h | 74 + third-party/folly/folly/detail/Futex-inl.h | 117 ++ third-party/folly/folly/detail/Futex.cpp | 263 +++ third-party/folly/folly/detail/Futex.h | 96 + third-party/folly/folly/functional/Invoke.h | 40 + third-party/folly/folly/hash/Hash.h | 29 + third-party/folly/folly/lang/Align.h | 38 + third-party/folly/folly/lang/Bits.h | 30 + third-party/folly/folly/lang/Launder.h | 51 + third-party/folly/folly/portability/Asm.h | 28 + .../folly/folly/portability/SysSyscall.h | 10 + .../folly/folly/portability/SysTypes.h | 26 + .../synchronization/AtomicNotification-inl.h | 138 ++ .../synchronization/AtomicNotification.cpp | 23 + .../synchronization/AtomicNotification.h | 57 + .../folly/synchronization/AtomicUtil-inl.h | 258 +++ .../folly/folly/synchronization/AtomicUtil.h | 52 + .../folly/folly/synchronization/Baton.h | 327 ++++ .../synchronization/DistributedMutex-inl.h | 1702 +++++++++++++++++ .../synchronization/DistributedMutex.cpp | 16 + .../folly/synchronization/DistributedMutex.h | 304 +++ .../DistributedMutexSpecializations.h | 39 + .../folly/synchronization/ParkingLot.cpp | 26 + .../folly/folly/synchronization/ParkingLot.h | 318 +++ .../folly/synchronization/WaitOptions.cpp | 12 + .../folly/folly/synchronization/WaitOptions.h | 57 + .../detail/InlineFunctionRef.h | 219 +++ .../detail/ProxyLockable-inl.h | 207 ++ .../synchronization/detail/ProxyLockable.h | 164 ++ .../folly/synchronization/detail/Sleeper.h | 57 + .../folly/folly/synchronization/detail/Spin.h | 77 + .../test/DistributedMutexTest.cpp | 1130 +++++++++++ 48 files 
changed, 7335 insertions(+), 1 deletion(-) create mode 100644 third-party/folly/folly/CPortability.h create mode 100644 third-party/folly/folly/ConstexprMath.h create mode 100644 third-party/folly/folly/Indestructible.h create mode 100644 third-party/folly/folly/Optional.h create mode 100644 third-party/folly/folly/Portability.h create mode 100644 third-party/folly/folly/ScopeGuard.h create mode 100644 third-party/folly/folly/Traits.h create mode 100644 third-party/folly/folly/Unit.h create mode 100644 third-party/folly/folly/Utility.h create mode 100644 third-party/folly/folly/chrono/Hardware.h create mode 100644 third-party/folly/folly/container/Array.h create mode 100644 third-party/folly/folly/detail/Futex-inl.h create mode 100644 third-party/folly/folly/detail/Futex.cpp create mode 100644 third-party/folly/folly/detail/Futex.h create mode 100644 third-party/folly/folly/functional/Invoke.h create mode 100644 third-party/folly/folly/hash/Hash.h create mode 100644 third-party/folly/folly/lang/Align.h create mode 100644 third-party/folly/folly/lang/Bits.h create mode 100644 third-party/folly/folly/lang/Launder.h create mode 100644 third-party/folly/folly/portability/Asm.h create mode 100644 third-party/folly/folly/portability/SysSyscall.h create mode 100644 third-party/folly/folly/portability/SysTypes.h create mode 100644 third-party/folly/folly/synchronization/AtomicNotification-inl.h create mode 100644 third-party/folly/folly/synchronization/AtomicNotification.cpp create mode 100644 third-party/folly/folly/synchronization/AtomicNotification.h create mode 100644 third-party/folly/folly/synchronization/AtomicUtil-inl.h create mode 100644 third-party/folly/folly/synchronization/AtomicUtil.h create mode 100644 third-party/folly/folly/synchronization/Baton.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutex-inl.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutex.cpp create mode 100644 third-party/folly/folly/synchronization/DistributedMutex.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutexSpecializations.h create mode 100644 third-party/folly/folly/synchronization/ParkingLot.cpp create mode 100644 third-party/folly/folly/synchronization/ParkingLot.h create mode 100644 third-party/folly/folly/synchronization/WaitOptions.cpp create mode 100644 third-party/folly/folly/synchronization/WaitOptions.h create mode 100644 third-party/folly/folly/synchronization/detail/InlineFunctionRef.h create mode 100644 third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h create mode 100644 third-party/folly/folly/synchronization/detail/ProxyLockable.h create mode 100644 third-party/folly/folly/synchronization/detail/Sleeper.h create mode 100644 third-party/folly/folly/synchronization/detail/Spin.h create mode 100644 third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8622242aa75..f81e0ca4f99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,13 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() +# third-party/folly is only validated to work on Linux and Windows for now. +# So only turn it on there by default. 
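+# (It can still be enabled manually on other platforms by passing
+# -DWITH_FOLLY_DISTRIBUTED_MUTEX=ON.)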
+if(CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows") + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) +else() + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) +endif() if(MSVC) # Defaults currently different for GFLAGS. # We will address find_package work a little later @@ -462,6 +469,9 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) +endif() find_package(Threads REQUIRED) # Main library source code @@ -738,6 +748,15 @@ else() env/io_posix.cc) endif() +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND SOURCES + third-party/folly/folly/detail/Futex.cpp + third-party/folly/folly/synchronization/AtomicNotification.cpp + third-party/folly/folly/synchronization/DistributedMutex.cpp + third-party/folly/folly/synchronization/ParkingLot.cpp + third-party/folly/folly/synchronization/WaitOptions.cpp) +endif() + set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) set(ROCKSDB_IMPORT_LIB ${ROCKSDB_SHARED_LIB}) @@ -1009,6 +1028,10 @@ if(WITH_TESTS) list(APPEND TESTS utilities/env_librados_test.cc) endif() + if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp) + endif() + set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc diff --git a/Makefile b/Makefile index 1718309cb89..ccca3ac5efb 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 + DEBUG_LEVEL=0 endif endif @@ -304,6 +304,10 @@ ifndef DISABLE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) endif +ifndef USE_FOLLY_DISTRIBUTED_MUTEX + USE_FOLLY_DISTRIBUTED_MUTEX=0 +endif + export GTEST_THROW_ON_FAILURE=1 export GTEST_HAS_EXCEPTIONS=1 GTEST_DIR = ./third-party/gtest-1.7.0/fused-src @@ -316,6 +320,18 @@ else PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR) endif +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLY_DIR = ./third-party/folly + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FOLLY_DIR) + PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) + else + PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) + PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) + endif +endif + # This (the first rule) must depend on "all". 
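# (Editor's note, illustrative only and not part of the original patch: with
# the USE_FOLLY_DISTRIBUTED_MUTEX plumbing added above, one plausible way to
# exercise the port is
#   USE_FOLLY_DISTRIBUTED_MUTEX=1 make folly_synchronization_distributed_mutex_test
# using the test target introduced elsewhere in this Makefile diff.)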
default: all @@ -402,6 +418,9 @@ endif LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) +endif GTEST = $(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = ./test_util/testutil.o @@ -569,6 +588,10 @@ TESTS = \ block_cache_tracer_test \ block_cache_trace_analyzer_test \ +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test +endif + PARALLEL_TEST = \ backupable_db_test \ db_bloom_filter_test \ @@ -1120,6 +1143,11 @@ trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) +folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o + $(AM_LINK) +endif + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 4a52c6cddb7..7b18a5d5f59 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -150,6 +150,9 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + USE_FOLLY_DISTRIBUTED_MUTEX=1 + fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -661,3 +664,6 @@ if test -n "$WITH_JEMALLOC_FLAG"; then echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" fi echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" +if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" +fi diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 4415f87da38..c2c39db48fe 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -159,4 +159,6 @@ else LUA_LIB=" $LUA_PATH/lib/liblua_pic.a" fi +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/fbcode_config_platform007.sh b/build_tools/fbcode_config_platform007.sh index 1a1e4208139..9da23fd843f 100644 --- a/build_tools/fbcode_config_platform007.sh +++ b/build_tools/fbcode_config_platform007.sh @@ -155,4 +155,6 @@ VALGRIND_VER="$VALGRIND_BASE/bin/" LUA_PATH= LUA_LIB= +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/src.mk b/src.mk index 6d1d655c7f0..8ebc0bee96c 100644 --- a/src.mk +++ b/src.mk @@ -263,6 +263,13 @@ TEST_LIB_SOURCES = \ test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ +FOLLY_SOURCES = \ + third-party/folly/folly/detail/Futex.cpp \ + third-party/folly/folly/synchronization/AtomicNotification.cpp \ + third-party/folly/folly/synchronization/DistributedMutex.cpp \ + third-party/folly/folly/synchronization/ParkingLot.cpp \ + third-party/folly/folly/synchronization/WaitOptions.cpp \ + MAIN_SOURCES = \ cache/cache_bench.cc \ cache/cache_test.cc \ diff --git a/third-party/folly/folly/CPortability.h b/third-party/folly/folly/CPortability.h new file mode 100644 index 00000000000..3ce3a7785ac --- /dev/null +++ b/third-party/folly/folly/CPortability.h 
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+/**
+ * Macro for marking functions as having public visibility.
+ */
+#if defined(__GNUC__)
+#define FOLLY_EXPORT __attribute__((__visibility__("default")))
+#else
+#define FOLLY_EXPORT
+#endif
diff --git a/third-party/folly/folly/ConstexprMath.h b/third-party/folly/folly/ConstexprMath.h
new file mode 100644
index 00000000000..b125c5f423b
--- /dev/null
+++ b/third-party/folly/folly/ConstexprMath.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace folly {
+template <typename T>
+constexpr T constexpr_max(T a) {
+  return a;
+}
+template <typename T, typename... Ts>
+constexpr T constexpr_max(T a, T b, Ts... ts) {
+  return b < a ? constexpr_max(a, ts...) : constexpr_max(b, ts...);
+}
+} // namespace folly
diff --git a/third-party/folly/folly/Indestructible.h b/third-party/folly/folly/Indestructible.h
new file mode 100644
index 00000000000..68249d86512
--- /dev/null
+++ b/third-party/folly/folly/Indestructible.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+namespace folly {
+
+/***
+ * Indestructible
+ *
+ * When you need a Meyers singleton that will not get destructed, even at
+ * shutdown, and you also want the object stored inline.
+ *
+ * Use like:
+ *
+ *   void doSomethingWithExpensiveData();
+ *
+ *   void doSomethingWithExpensiveData() {
+ *     static const Indestructible<map<string, int>> data{
+ *       map<string, int>{{"key1", 17}, {"key2", 19}, {"key3", 23}},
+ *     };
+ *     callSomethingTakingAMapByRef(*data);
+ *   }
+ *
+ * This should be used only for Meyers singletons, and, even then, only when
+ * the instance does not need to be destructed ever.
+ *
+ * This should not be used more generally, e.g., as member fields, etc.
+ *
+ * This is designed as an alternative, but with one fewer allocation at
+ * construction time and one fewer pointer dereference at access time, to the
+ * Meyers singleton pattern of:
+ *
+ *   void doSomethingWithExpensiveData() {
+ *     static const auto data = // never `delete`d
+ *         new map<string, int>{{"key1", 17}, {"key2", 19}, {"key3", 23}};
+ *     callSomethingTakingAMapByRef(*data);
+ *   }
+ */
+
+template <typename T>
+class Indestructible final {
+ public:
+  template
+  constexpr Indestructible() noexcept(noexcept(T())) {}
+
+  /**
+   * Constructor accepting a single argument by forwarding reference, this
+   * allows using list initialization without the overhead of things like
+   * in_place, etc and also works with std::initializer_list constructors
+   * which can't be deduced, the default parameter helps there.
+   *
+   *   auto i = folly::Indestructible<std::map<int, int>>{{{1, 2}}};
+   *
+   * This provides convenience
+   *
+   * There are two versions of this constructor - one for when the element is
+   * implicitly constructible from the given argument and one for when the
+   * type is explicitly but not implicitly constructible from the given
+   * argument.
+ */ + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + explicit constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + /* implicit */ constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + + template ()...))> + explicit constexpr Indestructible(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : storage_(std::forward(args)...) {} + template < + typename U, + typename... Args, + typename = decltype( + T(std::declval&>(), + std::declval()...))> + explicit constexpr Indestructible(std::initializer_list il, Args... args) noexcept( + noexcept( + T(std::declval&>(), + std::declval()...))) + : storage_(il, std::forward(args)...) {} + + ~Indestructible() = default; + + Indestructible(Indestructible const&) = delete; + Indestructible& operator=(Indestructible const&) = delete; + + Indestructible(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) + : storage_(std::move(other.storage_.value)) { + other.erased_ = true; + } + Indestructible& operator=(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) { + storage_.value = std::move(other.storage_.value); + other.erased_ = true; + } + + T* get() noexcept { + check(); + return &storage_.value; + } + T const* get() const noexcept { + check(); + return &storage_.value; + } + T& operator*() noexcept { + return *get(); + } + T const& operator*() const noexcept { + return *get(); + } + T* operator->() noexcept { + return get(); + } + T const* operator->() const noexcept { + return get(); + } + + private: + void check() const noexcept { + assert(!erased_); + } + + union Storage { + T value; + + template + constexpr Storage() noexcept(noexcept(T())) : value() {} + + template ()...))> + explicit constexpr Storage(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : value(std::forward(args)...) {} + + ~Storage() {} + }; + + Storage storage_{}; + bool erased_{false}; +}; +} // namespace folly diff --git a/third-party/folly/folly/Optional.h b/third-party/folly/folly/Optional.h new file mode 100644 index 00000000000..ee12467dda7 --- /dev/null +++ b/third-party/folly/folly/Optional.h @@ -0,0 +1,570 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/* + * Optional - For conditional initialization of values, like boost::optional, + * but with support for move semantics and emplacement. Reference type support + * has not been included due to limited use cases and potential confusion with + * semantics of assignment: Assigning to an optional reference could quite + * reasonably copy its value or redirect the reference. 
+ * + * Optional can be useful when a variable might or might not be needed: + * + * Optional maybeLogger = ...; + * if (maybeLogger) { + * maybeLogger->log("hello"); + * } + * + * Optional enables a 'null' value for types which do not otherwise have + * nullability, especially useful for parameter passing: + * + * void testIterator(const unique_ptr& it, + * initializer_list idsExpected, + * Optional> ranksExpected = none) { + * for (int i = 0; it->next(); ++i) { + * EXPECT_EQ(it->doc().id(), idsExpected[i]); + * if (ranksExpected) { + * EXPECT_EQ(it->doc().rank(), (*ranksExpected)[i]); + * } + * } + * } + * + * Optional models OptionalPointee, so calling 'get_pointer(opt)' will return a + * pointer to nullptr if the 'opt' is empty, and a pointer to the value if it is + * not: + * + * Optional maybeInt = ...; + * if (int* v = get_pointer(maybeInt)) { + * cout << *v << endl; + * } + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +template +class Optional; + +namespace detail { +template +struct OptionalPromiseReturn; +} // namespace detail + +struct None { + enum class _secret { _token }; + + /** + * No default constructor to support both `op = {}` and `op = none` + * as syntax for clearing an Optional, just like std::nullopt_t. + */ + constexpr explicit None(_secret) {} +}; +constexpr None none{None::_secret::_token}; + +class FOLLY_EXPORT OptionalEmptyException : public std::runtime_error { + public: + OptionalEmptyException() + : std::runtime_error("Empty Optional cannot be unwrapped") {} +}; + +template +class Optional { + public: + typedef Value value_type; + + static_assert( + !std::is_reference::value, + "Optional may not be used with reference types"); + static_assert( + !std::is_abstract::value, + "Optional may not be used with abstract types"); + + Optional() noexcept {} + + Optional(const Optional& src) noexcept( + std::is_nothrow_copy_constructible::value) { + if (src.hasValue()) { + construct(src.value()); + } + } + + Optional(Optional&& src) noexcept( + std::is_nothrow_move_constructible::value) { + if (src.hasValue()) { + construct(std::move(src.value())); + src.clear(); + } + } + + /* implicit */ Optional(const None&) noexcept {} + + /* implicit */ Optional(Value&& newValue) noexcept( + std::is_nothrow_move_constructible::value) { + construct(std::move(newValue)); + } + + /* implicit */ Optional(const Value& newValue) noexcept( + std::is_nothrow_copy_constructible::value) { + construct(newValue); + } + + template + explicit Optional(in_place_t, Args&&... args) noexcept( + std::is_nothrow_constructible::value) + : Optional{PrivateConstructor{}, std::forward(args)...} {} + + template + explicit Optional( + in_place_t, + std::initializer_list il, + Args&&... 
args) noexcept(std:: + is_nothrow_constructible< + Value, + std::initializer_list, + Args...>::value) + : Optional{PrivateConstructor{}, il, std::forward(args)...} {} + + // Used only when an Optional is used with coroutines on MSVC + /* implicit */ Optional(const detail::OptionalPromiseReturn& p) + : Optional{} { + p.promise_->value_ = this; + } + + void assign(const None&) { + clear(); + } + + void assign(Optional&& src) { + if (this != &src) { + if (src.hasValue()) { + assign(std::move(src.value())); + src.clear(); + } else { + clear(); + } + } + } + + void assign(const Optional& src) { + if (src.hasValue()) { + assign(src.value()); + } else { + clear(); + } + } + + void assign(Value&& newValue) { + if (hasValue()) { + storage_.value = std::move(newValue); + } else { + construct(std::move(newValue)); + } + } + + void assign(const Value& newValue) { + if (hasValue()) { + storage_.value = newValue; + } else { + construct(newValue); + } + } + + Optional& operator=(None) noexcept { + reset(); + return *this; + } + + template + Optional& operator=(Arg&& arg) { + assign(std::forward(arg)); + return *this; + } + + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable::value) { + assign(std::move(other)); + return *this; + } + + Optional& operator=(const Optional& other) noexcept( + std::is_nothrow_copy_assignable::value) { + assign(other); + return *this; + } + + template + Value& emplace(Args&&... args) { + clear(); + construct(std::forward(args)...); + return value(); + } + + template + typename std::enable_if< + std::is_constructible&, Args&&...>::value, + Value&>::type + emplace(std::initializer_list ilist, Args&&... args) { + clear(); + construct(ilist, std::forward(args)...); + return value(); + } + + void reset() noexcept { + storage_.clear(); + } + + void clear() noexcept { + reset(); + } + + void swap(Optional& that) noexcept(IsNothrowSwappable::value) { + if (hasValue() && that.hasValue()) { + using std::swap; + swap(value(), that.value()); + } else if (hasValue()) { + that.emplace(std::move(value())); + reset(); + } else if (that.hasValue()) { + emplace(std::move(that.value())); + that.reset(); + } + } + + const Value& value() const& { + require_value(); + return storage_.value; + } + + Value& value() & { + require_value(); + return storage_.value; + } + + Value&& value() && { + require_value(); + return std::move(storage_.value); + } + + const Value&& value() const&& { + require_value(); + return std::move(storage_.value); + } + + const Value* get_pointer() const& { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() & { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() && = delete; + + bool has_value() const noexcept { + return storage_.hasValue; + } + + bool hasValue() const noexcept { + return has_value(); + } + + explicit operator bool() const noexcept { + return has_value(); + } + + const Value& operator*() const& { + return value(); + } + Value& operator*() & { + return value(); + } + const Value&& operator*() const&& { + return std::move(value()); + } + Value&& operator*() && { + return std::move(value()); + } + + const Value* operator->() const { + return &value(); + } + Value* operator->() { + return &value(); + } + + // Return a copy of the value if set, or a given default if not. 
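// (Editorial illustration, not part of the original patch - hypothetical
// usage of value_or, which is defined just below:
//   folly::Optional<int> opt;
//   int x = opt.value_or(42);  // opt is empty, so x == 42
//   opt = 7;
//   int y = opt.value_or(42);  // opt is engaged, so y == 7
// )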
+ template + Value value_or(U&& dflt) const& { + if (storage_.hasValue) { + return storage_.value; + } + + return std::forward(dflt); + } + + template + Value value_or(U&& dflt) && { + if (storage_.hasValue) { + return std::move(storage_.value); + } + + return std::forward(dflt); + } + + private: + template + friend Optional<_t>> make_optional(T&&); + template + friend Optional make_optional(Args&&... args); + template + friend Optional make_optional(std::initializer_list, As&&...); + + /** + * Construct the optional in place, this is duplicated as a non-explicit + * constructor to allow returning values that are non-movable from + * make_optional using list initialization. + * + * Until C++17, at which point this will become unnecessary because of + * specified prvalue elision. + */ + struct PrivateConstructor { + explicit PrivateConstructor() = default; + }; + template + Optional(PrivateConstructor, Args&&... args) noexcept( + std::is_constructible::value) { + construct(std::forward(args)...); + } + + void require_value() const { + if (!storage_.hasValue) { + throw OptionalEmptyException{}; + } + } + + template + void construct(Args&&... args) { + const void* ptr = &storage_.value; + // For supporting const types. + new (const_cast(ptr)) Value(std::forward(args)...); + storage_.hasValue = true; + } + + struct StorageTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageTriviallyDestructible() + : emptyState('\0'), hasValue{false} {} + void clear() { + hasValue = false; + } + }; + + struct StorageNonTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageNonTriviallyDestructible() : hasValue{false} {} + ~StorageNonTriviallyDestructible() { + clear(); + } + + void clear() { + if (hasValue) { + hasValue = false; + value.~Value(); + } + } + }; + + using Storage = typename std::conditional< + std::is_trivially_destructible::value, + StorageTriviallyDestructible, + StorageNonTriviallyDestructible>::type; + + Storage storage_; +}; + +template +const T* get_pointer(const Optional& opt) { + return opt.get_pointer(); +} + +template +T* get_pointer(Optional& opt) { + return opt.get_pointer(); +} + +template +void swap(Optional& a, Optional& b) noexcept(noexcept(a.swap(b))) { + a.swap(b); +} + +template +Optional<_t>> make_optional(T&& v) { + using PrivateConstructor = + typename folly::Optional<_t>>::PrivateConstructor; + return {PrivateConstructor{}, std::forward(v)}; +} + +template +folly::Optional make_optional(Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, std::forward(args)...}; +} + +template +folly::Optional make_optional( + std::initializer_list il, + Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, il, std::forward(args)...}; +} + +/////////////////////////////////////////////////////////////////////////////// +// Comparisons. 
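// (Editorial illustration, not part of the original patch - the comparison
// operators defined below behave like this:
//   folly::Optional<int> a = 1;
//   folly::Optional<int> b;        // empty
//   assert(a != b);                // engagement differs
//   assert(b == folly::none);      // empty compares equal to none
//   assert(b < a);                 // empty orders before engaged
// )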
+ +template +bool operator==(const Optional& a, const V& b) { + return a.hasValue() && a.value() == b; +} + +template +bool operator!=(const Optional& a, const V& b) { + return !(a == b); +} + +template +bool operator==(const U& a, const Optional& b) { + return b.hasValue() && b.value() == a; +} + +template +bool operator!=(const U& a, const Optional& b) { + return !(a == b); +} + +template +bool operator==(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return false; + } + if (a.hasValue()) { + return a.value() == b.value(); + } + return true; +} + +template +bool operator!=(const Optional& a, const Optional& b) { + return !(a == b); +} + +template +bool operator<(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return a.hasValue() < b.hasValue(); + } + if (a.hasValue()) { + return a.value() < b.value(); + } + return false; +} + +template +bool operator>(const Optional& a, const Optional& b) { + return b < a; +} + +template +bool operator<=(const Optional& a, const Optional& b) { + return !(b < a); +} + +template +bool operator>=(const Optional& a, const Optional& b) { + return !(a < b); +} + +// Suppress comparability of Optional with T, despite implicit conversion. +template +bool operator<(const Optional&, const V& other) = delete; +template +bool operator<=(const Optional&, const V& other) = delete; +template +bool operator>=(const Optional&, const V& other) = delete; +template +bool operator>(const Optional&, const V& other) = delete; +template +bool operator<(const V& other, const Optional&) = delete; +template +bool operator<=(const V& other, const Optional&) = delete; +template +bool operator>=(const V& other, const Optional&) = delete; +template +bool operator>(const V& other, const Optional&) = delete; + +// Comparisons with none +template +bool operator==(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator==(None, const Optional& a) noexcept { + return !a.hasValue(); +} +template +bool operator<(const Optional&, None) noexcept { + return false; +} +template +bool operator<(None, const Optional& a) noexcept { + return a.hasValue(); +} +template +bool operator>(const Optional& a, None) noexcept { + return a.hasValue(); +} +template +bool operator>(None, const Optional&) noexcept { + return false; +} +template +bool operator<=(None, const Optional&) noexcept { + return true; +} +template +bool operator<=(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator>=(const Optional&, None) noexcept { + return true; +} +template +bool operator>=(None, const Optional& a) noexcept { + return !a.hasValue(); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace folly diff --git a/third-party/folly/folly/Portability.h b/third-party/folly/folly/Portability.h new file mode 100644 index 00000000000..2c6544c1961 --- /dev/null +++ b/third-party/folly/folly/Portability.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#if defined(__arm__) +#define FOLLY_ARM 1 +#else +#define FOLLY_ARM 0 +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define FOLLY_X64 1 +#else +#define FOLLY_X64 0 +#endif + +#if defined(__aarch64__) +#define FOLLY_AARCH64 1 +#else +#define FOLLY_AARCH64 0 +#endif + +#if defined(__powerpc64__) +#define FOLLY_PPC64 1 +#else +#define FOLLY_PPC64 0 +#endif + +#if defined(__has_builtin) +#define FOLLY_HAS_BUILTIN(...) __has_builtin(__VA_ARGS__) +#else +#define FOLLY_HAS_BUILTIN(...) 0 +#endif + +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(nodiscard) +#define FOLLY_NODISCARD [[nodiscard]] +#endif +#endif +#if !defined FOLLY_NODISCARD +#if defined(_MSC_VER) && (_MSC_VER >= 1700) +#define FOLLY_NODISCARD _Check_return_ +#elif defined(__GNUC__) +#define FOLLY_NODISCARD __attribute__((__warn_unused_result__)) +#else +#define FOLLY_NODISCARD +#endif +#endif + +namespace folly { +constexpr bool kIsArchArm = FOLLY_ARM == 1; +constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; +constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; +constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +} // namespace folly + +namespace folly { +#ifdef NDEBUG +constexpr auto kIsDebug = false; +#else +constexpr auto kIsDebug = true; +#endif +} // namespace folly + +namespace folly { +#if defined(_MSC_VER) +constexpr bool kIsMsvc = true; +#else +constexpr bool kIsMsvc = false; +#endif +} // namespace folly diff --git a/third-party/folly/folly/ScopeGuard.h b/third-party/folly/folly/ScopeGuard.h new file mode 100644 index 00000000000..71134406303 --- /dev/null +++ b/third-party/folly/folly/ScopeGuard.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +namespace folly { +namespace scope_guard_detail { +template +class ScopeGuardImpl { + public: + explicit ScopeGuardImpl(F&& f) : f_{std::forward(f)} {} + ~ScopeGuardImpl() { + f_(); + } + + private: + F f_; +}; + +enum class ScopeGuardEnum {}; +template >> +ScopeGuardImpl operator+(ScopeGuardEnum, Func&& func) { + return ScopeGuardImpl{std::forward(func)}; +} +} // namespace scope_guard_detail +} // namespace folly + +/** + * FB_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifndef FB_ANONYMOUS_VARIABLE +#define FB_CONCATENATE_IMPL(s1, s2) s1##s2 +#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define FB_ANONYMOUS_VARIABLE(str) \ + FB_CONCATENATE(FB_CONCATENATE(FB_CONCATENATE(str, __COUNTER__), _), __LINE__) +#else +#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__) +#endif +#endif + +#ifndef SCOPE_EXIT +#define SCOPE_EXIT \ + auto FB_ANONYMOUS_VARIABLE(SCOPE_EXIT_STATE) = \ + ::folly::scope_guard_detail::ScopeGuardEnum{} + [&]() noexcept +#endif diff --git a/third-party/folly/folly/Traits.h b/third-party/folly/folly/Traits.h new file mode 100644 index 00000000000..ea7e1eb1c05 --- /dev/null +++ b/third-party/folly/folly/Traits.h @@ -0,0 +1,152 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#include
+#include
+
+namespace folly {
+
+#if !defined(_MSC_VER)
+template <typename T>
+struct is_trivially_copyable
+    : std::integral_constant<bool, __has_trivial_copy(T)> {};
+#else
+template <typename T>
+using is_trivially_copyable = std::is_trivially_copyable<T>;
+#endif
+
+/***
+ * _t
+ *
+ * Instead of:
+ *
+ *   using decayed = typename std::decay<T>::type;
+ *
+ * With the C++14 standard trait aliases, we could use:
+ *
+ *   using decayed = std::decay_t<T>;
+ *
+ * Without them, we could use:
+ *
+ *   using decayed = _t<std::decay<T>>;
+ *
+ * Also useful for any other library with template types having dependent
+ * member types named `type`, like the standard trait types.
+ */
+template <typename T>
+using _t = typename T::type;
+
+/**
+ * type_t
+ *
+ * A type alias for the first template type argument. `type_t` is useful for
+ * controlling class-template and function-template partial specialization.
+ *
+ * Example:
+ *
+ *   template <typename Value>
+ *   class Container {
+ *    public:
+ *     template <typename... Args>
+ *     Container(
+ *         type_t<in_place_t, decltype(Value(std::declval<Args>()...))>,
+ *         Args&&...);
+ *   };
+ *
+ * void_t
+ *
+ * A type alias for `void`. `void_t` is useful for controlling class-template
+ * and function-template partial specialization.
+ *
+ * Example:
+ *
+ *   // has_value_type<T>::value is true if T has a nested type `value_type`
+ *   template <typename T, typename = void>
+ *   struct has_value_type
+ *       : std::false_type {};
+ *
+ *   template <typename T>
+ *   struct has_value_type<T, folly::void_t<typename T::value_type>>
+ *       : std::true_type {};
+ */
+
+/**
+ * There is a bug in libstdc++, libc++, and MSVC's STL that causes it to
+ * ignore unused template parameter arguments in template aliases and does not
+ * cause substitution failures. This defect has been recorded here:
+ * http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558.
+ *
+ * This causes the implementation of std::void_t to be buggy, as it is likely
+ * defined as something like the following:
+ *
+ *   template <typename... Ts>
+ *   using void_t = void;
+ *
+ * This causes the compiler to ignore all the template arguments and does not
+ * help when one wants to cause substitution failures. Rather declarations
+ * which have void_t in orthogonal specializations are treated as the same.
+ * For example, assuming the possible `T` types are only allowed to have
+ * either the alias `one` or `two` and never both or none:
+ *
+ *   template <typename T, void_t<typename T::one>* = nullptr>
+ *   void foo(T&&) {}
+ *   template <typename T, void_t<typename T::two>* = nullptr>
+ *   void foo(T&&) {}
+ *
+ * The second foo() will be a redefinition because it conflicts with the first
+ * one; void_t does not cause substitution failures - the template types are
+ * just ignored.
+ */ + +namespace traits_detail { +template +struct type_t_ { + using type = T; +}; +} // namespace traits_detail + +template +using type_t = typename traits_detail::type_t_::type; +template +using void_t = type_t; + +/** + * A type trait to remove all const volatile and reference qualifiers on a + * type T + */ +template +struct remove_cvref { + using type = + typename std::remove_cv::type>::type; +}; +template +using remove_cvref_t = typename remove_cvref::type; + +template +struct IsNothrowSwappable + : std::integral_constant< + bool, + std::is_nothrow_move_constructible::value&& noexcept( + std::swap(std::declval(), std::declval()))> {}; + +template +struct Conjunction : std::true_type {}; +template +struct Conjunction : T {}; +template +struct Conjunction + : std::conditional, T>::type {}; + +template +struct Negation : std::integral_constant {}; + +template +using index_constant = std::integral_constant; + +} // namespace folly diff --git a/third-party/folly/folly/Unit.h b/third-party/folly/folly/Unit.h new file mode 100644 index 00000000000..c8cb77e2c37 --- /dev/null +++ b/third-party/folly/folly/Unit.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace folly { + +/// In functional programming, the degenerate case is often called "unit". In +/// C++, "void" is often the best analogue. However, because of the syntactic +/// special-casing required for void, it is frequently a liability for template +/// metaprogramming. So, instead of writing specializations to handle cases like +/// SomeContainer, a library author may instead rule that out and simply +/// have library users use SomeContainer. Contained values may be ignored. +/// Much easier. +/// +/// "void" is the type that admits of no values at all. It is not possible to +/// construct a value of this type. +/// "unit" is the type that admits of precisely one unique value. It is +/// possible to construct a value of this type, but it is always the same value +/// every time, so it is uninteresting. +struct Unit { + constexpr bool operator==(const Unit& /*other*/) const { + return true; + } + constexpr bool operator!=(const Unit& /*other*/) const { + return false; + } +}; + +constexpr Unit unit{}; + +template +struct lift_unit { + using type = T; +}; +template <> +struct lift_unit { + using type = Unit; +}; +template +using lift_unit_t = typename lift_unit::type; + +template +struct drop_unit { + using type = T; +}; +template <> +struct drop_unit { + using type = void; +}; +template +using drop_unit_t = typename drop_unit::type; + +} // namespace folly + diff --git a/third-party/folly/folly/Utility.h b/third-party/folly/folly/Utility.h new file mode 100644 index 00000000000..7e43bdc2f17 --- /dev/null +++ b/third-party/folly/folly/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include + +namespace folly { + +/** + * Backports from C++17 of: + * std::in_place_t + * std::in_place_type_t + * std::in_place_index_t + * std::in_place + * std::in_place_type + * std::in_place_index + */ + +struct in_place_tag {}; +template +struct in_place_type_tag {}; +template +struct in_place_index_tag {}; + +using in_place_t = in_place_tag (&)(in_place_tag); +template +using in_place_type_t = in_place_type_tag (&)(in_place_type_tag); +template +using in_place_index_t = in_place_index_tag (&)(in_place_index_tag); + +inline in_place_tag in_place(in_place_tag = {}) { + return {}; +} +template +inline in_place_type_tag in_place_type(in_place_type_tag = {}) { + return {}; +} +template +inline in_place_index_tag in_place_index(in_place_index_tag = {}) { + return {}; +} + +template +T exchange(T& obj, U&& new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; +} + +namespace utility_detail { +template +struct make_seq_cat; +template < + template class S, + typename T, + T... Ta, + T... Tb, + T... Tc> +struct make_seq_cat, S, S> { + using type = + S; +}; + +// Not parameterizing by `template class, typename` because +// clang precisely v4.0 fails to compile that. Note that clang v3.9 and v5.0 +// handle that code correctly. +// +// For this to work, `S0` is required to be `Sequence` and `S1` is required +// to be `Sequence`. + +template +struct make_seq { + template + using apply = typename make_seq_cat< + typename make_seq::template apply, + typename make_seq::template apply, + typename make_seq::template apply>::type; +}; +template <> +struct make_seq<1> { + template + using apply = S1; +}; +template <> +struct make_seq<0> { + template + using apply = S0; +}; +} // namespace utility_detail + +// TODO: Remove after upgrading to C++14 baseline + +template +struct integer_sequence { + using value_type = T; + + static constexpr std::size_t size() noexcept { + return sizeof...(Ints); + } +}; + +template +using index_sequence = integer_sequence; + +template +using make_integer_sequence = typename utility_detail::make_seq< + Size>::template apply, integer_sequence>; + +template +using make_index_sequence = make_integer_sequence; +template +using index_sequence_for = make_index_sequence; + +/** + * A simple helper for getting a constant reference to an object. + * + * Example: + * + * std::vector v{1,2,3}; + * // The following two lines are equivalent: + * auto a = const_cast&>(v).begin(); + * auto b = folly::as_const(v).begin(); + * + * Like C++17's std::as_const. See http://wg21.link/p0007 + */ +template +T const& as_const(T& t) noexcept { + return t; +} + +template +void as_const(T const&&) = delete; + +} // namespace folly diff --git a/third-party/folly/folly/chrono/Hardware.h b/third-party/folly/folly/chrono/Hardware.h new file mode 100644 index 00000000000..ec7be82e8be --- /dev/null +++ b/third-party/folly/folly/chrono/Hardware.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include +#include + +#if _MSC_VER +extern "C" std::uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +namespace folly { + +inline std::uint64_t hardware_timestamp() { +#if _MSC_VER + return __rdtsc(); +#elif __GNUC__ && (__i386__ || FOLLY_X64) + return __builtin_ia32_rdtsc(); +#else + // use steady_clock::now() as an approximation for the timestamp counter on + // non-x86 systems + return std::chrono::steady_clock::now().time_since_epoch().count(); +#endif +} + +} // namespace folly + diff --git a/third-party/folly/folly/container/Array.h b/third-party/folly/folly/container/Array.h new file mode 100644 index 00000000000..bb3167b9793 --- /dev/null +++ b/third-party/folly/folly/container/Array.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include +#include + +namespace folly { + +namespace array_detail { +template +struct is_ref_wrapper : std::false_type {}; +template +struct is_ref_wrapper> : std::true_type {}; + +template +using not_ref_wrapper = + folly::Negation::type>>; + +template +struct return_type_helper { + using type = D; +}; +template +struct return_type_helper { + static_assert( + folly::Conjunction...>::value, + "TList cannot contain reference_wrappers when D is void"); + using type = typename std::common_type::type; +}; + +template +using return_type = std:: + array::type, sizeof...(TList)>; +} // namespace array_detail + +template +constexpr array_detail::return_type make_array(TList&&... t) { + using value_type = + typename array_detail::return_type_helper::type; + return {{static_cast(std::forward(t))...}}; +} + +namespace array_detail { +template +inline constexpr auto make_array_with( + MakeItem const& make, + folly::index_sequence) + -> std::array { + return std::array{{make(Index)...}}; +} +} // namespace array_detail + +// make_array_with +// +// Constructs a std::array<..., Size> with elements m(i) for i in [0, Size). +template +constexpr auto make_array_with(MakeItem const& make) + -> decltype(array_detail::make_array_with( + make, + folly::make_index_sequence{})) { + return array_detail::make_array_with( + make, + folly::make_index_sequence{}); +} + +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex-inl.h b/third-party/folly/folly/detail/Futex-inl.h new file mode 100644 index 00000000000..3b2a412bfb6 --- /dev/null +++ b/third-party/folly/folly/detail/Futex-inl.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { +namespace detail { + +/** Optimal when TargetClock is the same type as Clock. + * + * Otherwise, both Clock::now() and TargetClock::now() must be invoked. 
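 * (Editorial illustration, not part of the original patch: in the cross-clock
 * case, the deadline is rebased by taking the remaining duration and
 * re-anchoring it on the target clock, i.e. roughly
 *   TargetClock::now() + duration_cast<TargetDuration>(deadline - Clock::now())
 * which matches the final else-branch of this function.)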
*/ +template +typename TargetClock::time_point time_point_conv( + std::chrono::time_point const& time) { + using std::chrono::duration_cast; + using TimePoint = std::chrono::time_point; + using TargetDuration = typename TargetClock::duration; + using TargetTimePoint = typename TargetClock::time_point; + if (time == TimePoint::max()) { + return TargetTimePoint::max(); + } else if (std::is_same::value) { + // in place of time_point_cast, which cannot compile without if-constexpr + auto const delta = time.time_since_epoch(); + return TargetTimePoint(duration_cast(delta)); + } else { + // different clocks with different epochs, so non-optimal case + auto const delta = time - Clock::now(); + return TargetClock::now() + duration_cast(delta); + } +} + +/** + * Available overloads, with definitions elsewhere + * + * These functions are treated as ADL-extension points, the templates above + * call these functions without them having being pre-declared. This works + * because ADL lookup finds the definitions of these functions when you pass + * the relevant arguments + */ +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, nullptr, &deadline, waitMask); +} + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, &deadline, nullptr, waitMask); +} + +template +FutexResult +futexWait(const Futex* futex, uint32_t expected, uint32_t waitMask) { + auto rv = futexWaitImpl(futex, expected, nullptr, nullptr, waitMask); + assert(rv != FutexResult::TIMEDOUT); + return rv; +} + +template +int futexWake(const Futex* futex, int count, uint32_t wakeMask) { + return futexWakeImpl(futex, count, wakeMask); +} + +template +FutexResult futexWaitUntil( + const Futex* futex, + uint32_t expected, + std::chrono::time_point const& deadline, + uint32_t waitMask) { + using Target = typename std::conditional< + Clock::is_steady, + std::chrono::steady_clock, + std::chrono::system_clock>::type; + auto const converted = time_point_conv(deadline); + return converted == Target::time_point::max() + ? futexWaitImpl(futex, expected, nullptr, nullptr, waitMask) + : futexWaitImpl(futex, expected, converted, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.cpp b/third-party/folly/folly/detail/Futex.cpp new file mode 100644 index 00000000000..208578a901d --- /dev/null +++ b/third-party/folly/folly/detail/Futex.cpp @@ -0,0 +1,263 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __linux__ +#include +#endif + +#ifndef _WIN32 +#include +#endif + +using namespace std::chrono; + +namespace folly { +namespace detail { + +namespace { + +//////////////////////////////////////////////////// +// native implementation using the futex() syscall + +#ifdef __linux__ + +/// Certain toolchains (like Android's) don't include the full futex API in +/// their headers even though they support it. Make sure we have our constants +/// even if the headers don't have them. +#ifndef FUTEX_WAIT_BITSET +#define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +#define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CLOCK_REALTIME +#define FUTEX_CLOCK_REALTIME 256 +#endif + +int nativeFutexWake(const void* addr, int count, uint32_t wakeMask) { + int rv = syscall( + __NR_futex, + addr, /* addr1 */ + FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, /* op */ + count, /* val */ + nullptr, /* timeout */ + nullptr, /* addr2 */ + wakeMask); /* val3 */ + + /* NOTE: we ignore errors on wake for the case of a futex + guarding its own destruction, similar to this + glibc bug with sem_post/sem_wait: + https://sourceware.org/bugzilla/show_bug.cgi?id=12674 */ + if (rv < 0) { + return 0; + } + return rv; +} + +template +struct timespec timeSpecFromTimePoint(time_point absTime) { + auto epoch = absTime.time_since_epoch(); + if (epoch.count() < 0) { + // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) + epoch = Clock::duration::zero(); + } + + // timespec-safe seconds and nanoseconds; + // chrono::{nano,}seconds are `long long int` + // whereas timespec uses smaller types + using time_t_seconds = duration; + using long_nanos = duration; + + auto secs = duration_cast(epoch); + auto nanos = duration_cast(epoch - secs); + struct timespec result = {secs.count(), nanos.count()}; + return result; +} + +FutexResult nativeFutexWaitImpl( + const void* addr, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + assert(absSystemTime == nullptr || absSteadyTime == nullptr); + + int op = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG; + struct timespec ts; + struct timespec* timeout = nullptr; + + if (absSystemTime != nullptr) { + op |= FUTEX_CLOCK_REALTIME; + ts = timeSpecFromTimePoint(*absSystemTime); + timeout = &ts; + } else if (absSteadyTime != nullptr) { + ts = timeSpecFromTimePoint(*absSteadyTime); + timeout = &ts; + } + + // Unlike FUTEX_WAIT, FUTEX_WAIT_BITSET requires an absolute timeout + // value - http://locklessinc.com/articles/futex_cheat_sheet/ + int rv = syscall( + __NR_futex, + addr, /* addr1 */ + op, /* op */ + expected, /* val */ + timeout, /* timeout */ + nullptr, /* addr2 */ + waitMask); /* val3 */ + + if (rv == 0) { + return FutexResult::AWOKEN; + } else { + switch (errno) { + case ETIMEDOUT: + assert(timeout != nullptr); + return FutexResult::TIMEDOUT; + case EINTR: + return FutexResult::INTERRUPTED; + case EWOULDBLOCK: + return FutexResult::VALUE_CHANGED; + default: + assert(false); + // EINVAL, EACCESS, or EFAULT. EINVAL means there was an invalid + // op (should be impossible) or an invalid timeout (should have + // been sanitized by timeSpecFromTimePoint). EACCESS or EFAULT + // means *addr points to invalid memory, which is unlikely because + // the caller should have segfaulted already. 
We can either + // crash, or return a value that lets the process continue for + // a bit. We choose the latter. VALUE_CHANGED probably turns the + // caller into a spin lock. + return FutexResult::VALUE_CHANGED; + } + } +} + +#endif // __linux__ + +/////////////////////////////////////////////////////// +// compatibility implementation using standard C++ API + +using Lot = ParkingLot; +Lot parkingLot; + +int emulatedFutexWake(const void* addr, int count, uint32_t waitMask) { + int woken = 0; + parkingLot.unpark(addr, [&](const uint32_t& mask) { + if ((mask & waitMask) == 0) { + return UnparkControl::RetainContinue; + } + assert(count > 0); + count--; + woken++; + return count > 0 ? UnparkControl::RemoveContinue + : UnparkControl::RemoveBreak; + }); + return woken; +} + +template +FutexResult emulatedFutexWaitImpl( + F* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + static_assert( + std::is_same>::value || + std::is_same>::value, + "Type F must be either Futex or Futex"); + ParkResult res; + if (absSystemTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSystemTime); + } else if (absSteadyTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSteadyTime); + } else { + res = parkingLot.park( + futex, waitMask, [&] { return *futex == expected; }, [] {}); + } + switch (res) { + case ParkResult::Skip: + return FutexResult::VALUE_CHANGED; + case ParkResult::Unpark: + return FutexResult::AWOKEN; + case ParkResult::Timeout: + return FutexResult::TIMEDOUT; + } + + return FutexResult::INTERRUPTED; +} + +} // namespace + +///////////////////////////////// +// Futex<> overloads + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { +#ifdef __linux__ + return nativeFutexWake(futex, count, wakeMask); +#else + return emulatedFutexWake(futex, count, wakeMask); +#endif +} + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { + return emulatedFutexWake(futex, count, wakeMask); +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { +#ifdef __linux__ + return nativeFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#else + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#endif +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.h b/third-party/folly/folly/detail/Futex.h new file mode 100644 index 00000000000..987a1b89574 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace folly { +namespace detail { + +enum class FutexResult { + VALUE_CHANGED, /* futex value didn't match expected */ + AWOKEN, /* wakeup by matching futex wake, or spurious wakeup */ + INTERRUPTED, /* wakeup by interrupting signal */ + TIMEDOUT, /* wakeup by expiring deadline */ +}; + +/** + * Futex is an atomic 32 bit unsigned integer that provides access to the + * futex() syscall on that value. It is templated in such a way that it + * can interact properly with DeterministicSchedule testing. + * + * If you don't know how to use futex(), you probably shouldn't be using + * this class. Even if you do know how, you should have a good reason + * (and benchmarks to back you up). + * + * Because of the semantics of the futex syscall, the futex family of + * functions are available as free functions rather than member functions + */ +template
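[The patch text is truncated here.]

(Editor's note: as a hedged illustration of how the two headline primitives
from this patch are intended to be used - assuming the Futex free functions
shown above and the proxy-returning lock()/unlock() API that
folly::DistributedMutex documents upstream; the mask defaults and exact
signatures below are best-effort assumptions, not verbatim from the patch:)

#include <folly/detail/Futex.h>
#include <folly/synchronization/DistributedMutex.h>

#include <thread>
#include <utility>

int main() {
  // Futex: a 32-bit word one thread waits on until another thread wakes it.
  folly::detail::Futex<> word{0};
  std::thread waiter([&] {
    // Blocks while word == 0; returns VALUE_CHANGED immediately otherwise.
    folly::detail::futexWait(&word, /* expected */ 0, /* waitMask */ 0xffffffffu);
  });
  word.store(1);
  folly::detail::futexWake(&word, /* count */ 1, /* wakeMask */ 0xffffffffu);
  waiter.join();

  // DistributedMutex: lock() returns a proxy token that unlock() consumes,
  // instead of the usual void-returning lock()/unlock() pair.
  folly::DistributedMutex mutex;
  auto proxy = mutex.lock();
  // ... critical section ...
  mutex.unlock(std::move(proxy));
  return 0;
}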